File indexing completed on 2024-04-06 12:15:45
0001 #ifndef HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
0002 #define HeterogenousCore_CUDAUtilities_src_CachingHostAllocator_h
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041 #include <cmath>
0042 #include <map>
0043 #include <set>
0044 #include <mutex>
0045
0046 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0047
0048
0049 namespace notcub {
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100 struct CachingHostAllocator {
0101
0102
0103
0104
0105
// Sentinel bin index (all bits set). HostAllocate stores it in blocks that are too
// large for any bin, and it is the default for the constructor's max_bin parameter.
static const unsigned int INVALID_BIN = static_cast<unsigned int>(-1);

// Sentinel byte count (all bits set); default for the constructor's max_cached_bytes parameter.
static const size_t INVALID_SIZE = static_cast<size_t>(-1);
0110
0111 #ifndef DOXYGEN_SHOULD_SKIP_THIS
0112
0113
// Sentinel device ordinal meaning "no device recorded / not queried yet".
static const int INVALID_DEVICE_ORDINAL{-1};
0115
0116
0117
0118
0119
0120
0121
0122
0123 struct BlockDescriptor {
0124 void *d_ptr;
0125 size_t bytes;
0126 unsigned int bin;
0127 int device;
0128 cudaStream_t associated_stream;
0129 cudaEvent_t ready_event;
0130
0131
0132 BlockDescriptor(void *d_ptr)
0133 : d_ptr(d_ptr),
0134 bytes(0),
0135 bin(INVALID_BIN),
0136 device(INVALID_DEVICE_ORDINAL),
0137 associated_stream(nullptr),
0138 ready_event(nullptr) {}
0139
0140
0141 BlockDescriptor()
0142 : d_ptr(nullptr),
0143 bytes(0),
0144 bin(INVALID_BIN),
0145 device(INVALID_DEVICE_ORDINAL),
0146 associated_stream(nullptr),
0147 ready_event(nullptr) {}
0148
0149
0150 static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.d_ptr < b.d_ptr); }
0151
0152
0153 static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { return (a.bytes < b.bytes); }
0154 };
0155
0156
0157 typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
0158
// Pair of running byte counters, both starting at zero.
class TotalBytes {
public:
  size_t free;  // bytes held by blocks sitting in the cache
  size_t live;  // bytes held by blocks currently handed out

  TotalBytes() : free(0), live(0) {}
};
0165
0166
0167 typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
0168
0169
0170 typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
0171
0172
0173
0174
0175
0176
0177
0178
/**
 * Integer power base^exp computed by repeated squaring.
 *
 * All arithmetic is unsigned, so a result that does not fit in unsigned int
 * wraps around instead of being reported. IntPow(x, 0) is 1 for every x.
 */
static unsigned int IntPow(unsigned int base, unsigned int exp) {
  unsigned int result = 1;
  // Walk the exponent bit by bit; `base` holds the original base raised to 2^k at step k.
  for (; exp != 0; exp >>= 1, base *= base) {
    if ((exp & 1u) != 0) {
      result *= base;
    }
  }
  return result;
}
0190
0191
0192
0193
/**
 * Rounds `value` up to the nearest power of `base`.
 *
 * On return `rounded_bytes` is the smallest power of `base` that is >= value
 * and `power` is the matching exponent (value 0 or 1 gives power 0, rounded_bytes 1).
 * If `value * base` wraps around, the outputs saturate instead: `power` becomes the
 * bit width of size_t and `rounded_bytes` the largest size_t value.
 */
void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value) {
  // Wrap-around of the product is used as the overflow indicator.
  const size_t scaled = value * base;
  if (scaled < value) {
    power = sizeof(size_t) * 8;
    rounded_bytes = size_t(0) - 1;
    return;
  }

  unsigned int exponent = 0;
  size_t candidate = 1;
  for (; candidate < value; ++exponent) {
    candidate *= base;
  }

  power = exponent;
  rounded_bytes = candidate;
}
0210
0211
0212
0213
0214
0215 std::mutex mutex;
0216
0217 unsigned int bin_growth;
0218 unsigned int min_bin;
0219 unsigned int max_bin;
0220
0221 size_t min_bin_bytes;
0222 size_t max_bin_bytes;
0223 size_t max_cached_bytes;
0224
0225 const bool
0226 skip_cleanup;
0227 bool debug;
0228
0229 TotalBytes cached_bytes;
0230 CachedBlocks cached_blocks;
0231 BusyBlocks live_blocks;
0232
0233 #endif
0234
0235
0236
0237
0238
0239
0240
0241
0242 CachingHostAllocator(
0243 unsigned int bin_growth,
0244 unsigned int min_bin = 1,
0245 unsigned int max_bin = INVALID_BIN,
0246 size_t max_cached_bytes = INVALID_SIZE,
0247 bool skip_cleanup =
0248 false,
0249 bool debug = false)
0250 : bin_growth(bin_growth),
0251 min_bin(min_bin),
0252 max_bin(max_bin),
0253 min_bin_bytes(IntPow(bin_growth, min_bin)),
0254 max_bin_bytes(IntPow(bin_growth, max_bin)),
0255 max_cached_bytes(max_cached_bytes),
0256 skip_cleanup(skip_cleanup),
0257 debug(debug),
0258 cached_blocks(BlockDescriptor::SizeCompare),
0259 live_blocks(BlockDescriptor::PtrCompare) {}
0260
0261
0262
0263
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274 CachingHostAllocator(bool skip_cleanup = false, bool debug = false)
0275 : bin_growth(8),
0276 min_bin(3),
0277 max_bin(7),
0278 min_bin_bytes(IntPow(bin_growth, min_bin)),
0279 max_bin_bytes(IntPow(bin_growth, max_bin)),
0280 max_cached_bytes((max_bin_bytes * 3) - 1),
0281 skip_cleanup(skip_cleanup),
0282 debug(debug),
0283 cached_blocks(BlockDescriptor::SizeCompare),
0284 live_blocks(BlockDescriptor::PtrCompare) {}
0285
0286
0287
0288
0289
0290
0291
0292 void SetMaxCachedBytes(size_t max_cached_bytes) {
0293
0294 std::unique_lock mutex_locker(mutex);
0295
0296 if (debug)
0297 printf("Changing max_cached_bytes (%lld -> %lld)\n",
0298 (long long)this->max_cached_bytes,
0299 (long long)max_cached_bytes);
0300
0301 this->max_cached_bytes = max_cached_bytes;
0302
0303
0304 mutex_locker.unlock();
0305 }
0306
0307
0308
0309
0310
0311
0312 cudaError_t HostAllocate(
0313 void **d_ptr,
0314 size_t bytes,
0315 cudaStream_t active_stream = nullptr)
0316 {
0317 std::unique_lock<std::mutex> mutex_locker(mutex, std::defer_lock);
0318 *d_ptr = nullptr;
0319 int device = INVALID_DEVICE_ORDINAL;
0320 cudaError_t error = cudaSuccess;
0321
0322 cudaCheck(error = cudaGetDevice(&device));
0323
0324
0325 bool found = false;
0326 BlockDescriptor search_key;
0327 search_key.device = device;
0328 search_key.associated_stream = active_stream;
0329 NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
0330
0331 if (search_key.bin > max_bin) {
0332
0333
0334
0335 search_key.bin = INVALID_BIN;
0336 search_key.bytes = bytes;
0337 } else {
0338
0339 mutex_locker.lock();
0340
0341 if (search_key.bin < min_bin) {
0342
0343 search_key.bin = min_bin;
0344 search_key.bytes = min_bin_bytes;
0345 }
0346
0347
0348 CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
0349 while ((block_itr != cached_blocks.end()) && (block_itr->bin == search_key.bin)) {
0350
0351
0352 if (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady) {
0353
0354 found = true;
0355 search_key = *block_itr;
0356 search_key.associated_stream = active_stream;
0357 if (search_key.device != device) {
0358
0359 cudaCheck(error = cudaSetDevice(search_key.device));
0360 cudaCheck(error = cudaEventDestroy(search_key.ready_event));
0361 cudaCheck(error = cudaSetDevice(device));
0362 cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
0363 search_key.device = device;
0364 }
0365
0366 live_blocks.insert(search_key);
0367
0368
0369 cached_bytes.free -= search_key.bytes;
0370 cached_bytes.live += search_key.bytes;
0371
0372 if (debug)
0373 printf(
0374 "\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld "
0375 "(previously associated with stream %lld, event %lld).\n",
0376 search_key.d_ptr,
0377 (long long)search_key.bytes,
0378 (long long)search_key.associated_stream,
0379 (long long)search_key.ready_event,
0380 (long long)search_key.device,
0381 (long long)block_itr->associated_stream,
0382 (long long)block_itr->ready_event);
0383
0384 cached_blocks.erase(block_itr);
0385
0386 break;
0387 }
0388 block_itr++;
0389 }
0390
0391
0392 mutex_locker.unlock();
0393 }
0394
0395
0396 if (!found) {
0397
0398
0399 if ((error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault)) ==
0400 cudaErrorMemoryAllocation) {
0401
0402 if (debug)
0403 printf(
0404 "\tHost failed to allocate %lld bytes for stream %lld on device %lld, retrying after freeing cached "
0405 "allocations",
0406 (long long)search_key.bytes,
0407 (long long)search_key.associated_stream,
0408 (long long)search_key.device);
0409
0410 error = cudaSuccess;
0411 cudaGetLastError();
0412
0413
0414 mutex_locker.lock();
0415
0416
0417 CachedBlocks::iterator block_itr = cached_blocks.begin();
0418
0419 while ((block_itr != cached_blocks.end())) {
0420
0421
0422
0423
0424
0425 if ((error = cudaFreeHost(block_itr->d_ptr)))
0426 break;
0427 if ((error = cudaEventDestroy(block_itr->ready_event)))
0428 break;
0429
0430
0431 cached_bytes.free -= block_itr->bytes;
0432
0433 if (debug)
0434 printf(
0435 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
0436 "bytes) outstanding.\n",
0437 (long long)block_itr->bytes,
0438 (long long)cached_blocks.size(),
0439 (long long)cached_bytes.free,
0440 (long long)live_blocks.size(),
0441 (long long)cached_bytes.live);
0442
0443 cached_blocks.erase(block_itr);
0444
0445 block_itr++;
0446 }
0447
0448
0449 mutex_locker.unlock();
0450
0451
0452 if (error)
0453 return error;
0454
0455
0456 cudaCheck(error = cudaHostAlloc(&search_key.d_ptr, search_key.bytes, cudaHostAllocDefault));
0457 }
0458
0459
0460 cudaCheck(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming));
0461
0462
0463 mutex_locker.lock();
0464 live_blocks.insert(search_key);
0465 cached_bytes.live += search_key.bytes;
0466 mutex_locker.unlock();
0467
0468 if (debug)
0469 printf(
0470 "\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device "
0471 "%lld).\n",
0472 search_key.d_ptr,
0473 (long long)search_key.bytes,
0474 (long long)search_key.associated_stream,
0475 (long long)search_key.ready_event,
0476 (long long)search_key.device);
0477 }
0478
0479
0480 *d_ptr = search_key.d_ptr;
0481
0482 if (debug)
0483 printf("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
0484 (long long)cached_blocks.size(),
0485 (long long)cached_bytes.free,
0486 (long long)live_blocks.size(),
0487 (long long)cached_bytes.live);
0488
0489 return error;
0490 }
0491
0492
0493
0494
0495
0496
0497 cudaError_t HostFree(void *d_ptr) {
0498 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0499 cudaError_t error = cudaSuccess;
0500
0501
0502 std::unique_lock<std::mutex> mutex_locker(mutex);
0503
0504
0505 bool recached = false;
0506 BlockDescriptor search_key(d_ptr);
0507 BusyBlocks::iterator block_itr = live_blocks.find(search_key);
0508 if (block_itr != live_blocks.end()) {
0509
0510 search_key = *block_itr;
0511 live_blocks.erase(block_itr);
0512 cached_bytes.live -= search_key.bytes;
0513
0514
0515 if ((search_key.bin != INVALID_BIN) && (cached_bytes.free + search_key.bytes <= max_cached_bytes)) {
0516
0517 recached = true;
0518 cached_blocks.insert(search_key);
0519 cached_bytes.free += search_key.bytes;
0520
0521 if (debug)
0522 printf(
0523 "\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld "
0524 "available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
0525 (long long)search_key.bytes,
0526 (long long)search_key.associated_stream,
0527 (long long)search_key.ready_event,
0528 (long long)search_key.device,
0529 (long long)cached_blocks.size(),
0530 (long long)cached_bytes.free,
0531 (long long)live_blocks.size(),
0532 (long long)cached_bytes.live);
0533 }
0534 }
0535
0536 cudaCheck(error = cudaGetDevice(&entrypoint_device));
0537 if (entrypoint_device != search_key.device) {
0538 cudaCheck(error = cudaSetDevice(search_key.device));
0539 }
0540
0541 if (recached) {
0542
0543 cudaCheck(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream));
0544 }
0545
0546
0547 mutex_locker.unlock();
0548
0549 if (!recached) {
0550
0551 cudaCheck(error = cudaFreeHost(d_ptr));
0552 cudaCheck(error = cudaEventDestroy(search_key.ready_event));
0553
0554 if (debug)
0555 printf(
0556 "\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available "
0557 "blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
0558 (long long)search_key.bytes,
0559 (long long)search_key.associated_stream,
0560 (long long)search_key.ready_event,
0561 (long long)search_key.device,
0562 (long long)cached_blocks.size(),
0563 (long long)cached_bytes.free,
0564 (long long)live_blocks.size(),
0565 (long long)cached_bytes.live);
0566 }
0567
0568
0569 if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != search_key.device)) {
0570 cudaCheck(error = cudaSetDevice(entrypoint_device));
0571 }
0572
0573 return error;
0574 }
0575
0576
0577
0578
0579 cudaError_t FreeAllCached() {
0580 cudaError_t error = cudaSuccess;
0581 int entrypoint_device = INVALID_DEVICE_ORDINAL;
0582 int current_device = INVALID_DEVICE_ORDINAL;
0583
0584 std::unique_lock<std::mutex> mutex_locker(mutex);
0585
0586 while (!cached_blocks.empty()) {
0587
0588 CachedBlocks::iterator begin = cached_blocks.begin();
0589
0590
0591 if (entrypoint_device == INVALID_DEVICE_ORDINAL) {
0592 if ((error = cudaGetDevice(&entrypoint_device)))
0593 break;
0594 }
0595
0596
0597 if (begin->device != current_device) {
0598 if ((error = cudaSetDevice(begin->device)))
0599 break;
0600 current_device = begin->device;
0601 }
0602
0603
0604 if ((error = cudaFreeHost(begin->d_ptr)))
0605 break;
0606 if ((error = cudaEventDestroy(begin->ready_event)))
0607 break;
0608
0609
0610 cached_bytes.free -= begin->bytes;
0611
0612 if (debug)
0613 printf(
0614 "\tHost freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld "
0615 "bytes) outstanding.\n",
0616 (long long)begin->bytes,
0617 (long long)cached_blocks.size(),
0618 (long long)cached_bytes.free,
0619 (long long)live_blocks.size(),
0620 (long long)cached_bytes.live);
0621
0622 cached_blocks.erase(begin);
0623 }
0624
0625 mutex_locker.unlock();
0626
0627
0628 if (entrypoint_device != INVALID_DEVICE_ORDINAL) {
0629 cudaCheck(error = cudaSetDevice(entrypoint_device));
0630 }
0631
0632 return error;
0633 }
0634
0635
0636
0637
0638 ~CachingHostAllocator() {
0639 if (!skip_cleanup)
0640 FreeAllCached();
0641 }
0642 };
0643
0644
0645
0646 }
0647
0648 #endif