#include "HeterogeneousCore/SonicTriton/interface/TritonData.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonMemResource.h"
#include "HeterogeneousCore/SonicTriton/interface/triton_utils.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

namespace tc = triton::client;

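//base class: wraps a named memory region of a given size;
//subclasses handle allocation, registration with the server, and data transfer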
template <typename IO>
TritonMemResource<IO>::TritonMemResource(TritonData<IO>* data, const std::string& name, size_t size)
    : data_(data), name_(name), size_(size), addr_(nullptr), closed_(false) {}

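//point each request entry at this memory region, at the entry's own offset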
template <typename IO>
void TritonMemResource<IO>::set() {
  for (auto& entry : data_->entries_) {
    TRITON_THROW_IF_ERROR(entry.data_->SetSharedMemory(name_, entry.totalByteSize_, entry.offset_),
                          "unable to set shared memory (" + name_ + ")",
                          true);
  }
}

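//close() can throw; destructors call closeSafe() to downgrade any exception to a warning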
template <typename IO>
void TritonMemResource<IO>::closeSafe() {
  CMS_SA_ALLOW try { close(); } catch (TritonException& e) {
    e.convertToWarning();
  } catch (cms::Exception& e) {
    triton_utils::convertToWarning(e);
  } catch (std::exception& e) {
    edm::LogWarning("UnknownFailure") << e.what();
  } catch (...) {
    edm::LogWarning("UnknownFailure") << "An unknown exception was thrown";
  }
}

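//heap resource: no shared memory; input/output data passes through the Triton client library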
template <typename IO>
TritonHeapResource<IO>::TritonHeapResource(TritonData<IO>* data, const std::string& name, size_t size)
    : TritonMemResource<IO>(data, name, size) {}

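//append the raw bytes for one batch entry to the request via the client API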
template <>
void TritonInputHeapResource::copyInput(const void* values, size_t offset, unsigned entry) {
  TRITON_THROW_IF_ERROR(data_->entries_[entry].data_->AppendRaw(reinterpret_cast<const uint8_t*>(values),
                                                                data_->entries_[entry].byteSizePerBatch_),
                        data_->name_ + " toServer(): unable to set data for batch entry " +
                            (data_->entries_.size() > 1 ? std::to_string(entry)
                             : data_->entries_[entry].byteSizePerBatch_
                                 ? std::to_string(offset / data_->entries_[entry].byteSizePerBatch_)
                                 : ""),
                        false);
}

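//retrieve the raw output bytes for each entry and check that the total matches the expected size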
template <>
void TritonOutputHeapResource::copyOutput() {
  size_t contentByteSize = 0;
  for (auto& entry : data_->entries_) {
    size_t contentByteSizeEntry(0);
    if (entry.totalByteSize_ > 0)
      TRITON_THROW_IF_ERROR(entry.result_->RawData(data_->name_, &entry.output_, &contentByteSizeEntry),
                            data_->name_ + " fromServer(): unable to get raw",
                            false);
    contentByteSize += contentByteSizeEntry;
  }
  if (contentByteSize != data_->totalByteSize_) {
    throw cms::Exception("TritonDataError") << data_->name_ << " fromServer(): unexpected content byte size "
                                            << contentByteSize << " (expected " << data_->totalByteSize_ << ")";
  }
}

//shared memory helpers based on:
// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/shm_utils.cc (cpu)
// https://github.com/triton-inference-server/server/blob/v2.3.0/src/clients/c++/examples/simple_grpc_cudashm_client.cc (gpu)

template <typename IO>
TritonCpuShmResource<IO>::TritonCpuShmResource(TritonData<IO>* data, const std::string& name, size_t size)
    : TritonMemResource<IO>(data, name, size), sizeOrig_(size) {
  //mmap of size zero is required to fail by POSIX, but still need to have some shared memory region available for Triton
  this->size_ = std::max<size_t>(this->size_, 1);

  //get shared memory region descriptor
  int shm_fd = shm_open(this->name_.c_str(), O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
  if (shm_fd == -1)
    throw cms::Exception("TritonError") << "unable to get shared memory descriptor for key: " + this->name_;

  //extend shared memory object
  int res = ftruncate(shm_fd, this->size_);
  if (res == -1)
    throw cms::Exception("TritonError") << "unable to initialize shared memory key " + this->name_ +
                                               " to requested size: " + std::to_string(this->size_);

  //map to process address space
  constexpr size_t offset(0);
  this->addr_ = (uint8_t*)mmap(nullptr, this->size_, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, offset);
  if (this->addr_ == MAP_FAILED)
    throw cms::Exception("TritonError") << "unable to map to process address space for shared memory key: " +
                                               this->name_;

  //close descriptor
  if (::close(shm_fd) == -1)
    throw cms::Exception("TritonError") << "unable to close descriptor for shared memory key: " + this->name_;

  TRITON_THROW_IF_ERROR(this->data_->client()->RegisterSystemSharedMemory(this->name_, this->name_, this->size_),
                        "unable to register shared memory region: " + this->name_,
                        true);
}

template <typename IO>
TritonCpuShmResource<IO>::~TritonCpuShmResource() {
  this->closeSafe();
}

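//teardown: unregister from the server, unmap the region, then unlink the POSIX shm object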
template <typename IO>
void TritonCpuShmResource<IO>::close() {
  if (this->closed_)
    return;

  TRITON_THROW_IF_ERROR(this->data_->client()->UnregisterSystemSharedMemory(this->name_),
                        "unable to unregister shared memory region: " + this->name_,
                        true);

  //unmap
  int tmp_fd = munmap(this->addr_, this->size_);
  if (tmp_fd == -1)
    throw cms::Exception("TritonError") << "unable to munmap for shared memory key: " << this->name_;

  //unlink
  int shm_fd = shm_unlink(this->name_.c_str());
  if (shm_fd == -1)
    throw cms::Exception("TritonError") << "unable to unlink for shared memory key: " << this->name_;

  this->closed_ = true;
}

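//write directly into the mapped region (skipped if the originally requested size was zero)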
template <>
void TritonInputCpuShmResource::copyInput(const void* values, size_t offset, unsigned entry) {
  if (sizeOrig_ > 0)
    std::memcpy(addr_ + offset, values, data_->entries_[entry].byteSizePerBatch_);
}

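//outputs are already in the mapped region; just point each entry at its offset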
template <>
void TritonOutputCpuShmResource::copyOutput() {
  for (auto& entry : data_->entries_) {
    entry.output_ = addr_ + entry.offset_;
  }
}

template class TritonHeapResource<tc::InferInput>;
template class TritonCpuShmResource<tc::InferInput>;
template class TritonHeapResource<tc::InferRequestedOutput>;
template class TritonCpuShmResource<tc::InferRequestedOutput>;
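
//hypothetical usage sketch (not part of this file): a TritonData<IO> object
//owns one of these resources and drives it roughly as follows, assuming a
//single-entry CPU shared-memory input of nBytes:
//  TritonCpuShmResource<tc::InferInput> res(&data, "shm_key_0", nBytes);
//  res.copyInput(hostPtr, 0, 0);  //memcpy into the mapped region
//  res.set();                     //attach the region to the request entries
//  ...                            //after inference, close() unregisters/unmaps/unlinks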

#ifdef TRITON_ENABLE_GPU
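//GPU shared memory: allocate a device buffer and share it with the server via a CUDA IPC handle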
template <typename IO>
TritonGpuShmResource<IO>::TritonGpuShmResource(TritonData<IO>* data, const std::string& name, size_t size)
    : TritonMemResource<IO>(data, name, size), deviceId_(0), handle_(std::make_shared<cudaIpcMemHandle_t>()) {
  //todo: get server device id somehow?
  cudaCheck(cudaSetDevice(deviceId_), "unable to set device ID to " + std::to_string(deviceId_));
  cudaCheck(cudaMalloc((void**)&this->addr_, this->size_), "unable to allocate GPU memory for key: " + this->name_);
  cudaCheck(cudaIpcGetMemHandle(handle_.get(), this->addr_), "unable to get IPC handle for key: " + this->name_);
  TRITON_THROW_IF_ERROR(this->data_->client()->RegisterCudaSharedMemory(this->name_, *handle_, deviceId_, this->size_),
                        "unable to register CUDA shared memory region: " + this->name_,
                        true);
}

template <typename IO>
TritonGpuShmResource<IO>::~TritonGpuShmResource() {
  this->closeSafe();
}

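//teardown: unregister the CUDA region from the server, then free the device buffer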
template <typename IO>
void TritonGpuShmResource<IO>::close() {
  if (this->closed_)
    return;
  TRITON_THROW_IF_ERROR(this->data_->client()->UnregisterCudaSharedMemory(this->name_),
                        "unable to unregister CUDA shared memory region: " + this->name_,
                        true);
  cudaCheck(cudaFree(this->addr_), "unable to free GPU memory for key: " + this->name_);
  this->closed_ = true;
}

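//host-to-device copy into the registered CUDA buffer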
template <>
void TritonInputGpuShmResource::copyInput(const void* values, size_t offset, unsigned entry) {
  cudaCheck(cudaMemcpy(addr_ + offset, values, data_->entries_[entry].byteSizePerBatch_, cudaMemcpyHostToDevice),
            data_->name_ + " toServer(): unable to memcpy " + std::to_string(data_->entries_[entry].byteSizePerBatch_) +
                " bytes to GPU");
}

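//device-to-host copy of all outputs at once; data_->holder_ keeps the host buffer alive
//while each entry points into it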
template <>
void TritonOutputGpuShmResource::copyOutput() {
  //copy back from gpu, keep in scope
  auto ptr = std::make_shared<std::vector<uint8_t>>(data_->totalByteSize_);
  cudaCheck(
      cudaMemcpy(ptr->data(), addr_, data_->totalByteSize_, cudaMemcpyDeviceToHost),
      data_->name_ + " fromServer(): unable to memcpy " + std::to_string(data_->totalByteSize_) + " bytes from GPU");
  data_->holder_ = ptr;
  for (auto& entry : data_->entries_) {
    entry.output_ = ptr->data() + entry.offset_;
  }
}

template class TritonGpuShmResource<tc::InferInput>;
template class TritonGpuShmResource<tc::InferRequestedOutput>;
#endif