#include <algorithm>
#include <cassert>
#include <utility>

#include <cuda.h>
#include <cuda_runtime.h>

#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h"
#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "Geometry/CommonDetUnit/interface/GeomDetType.h"
#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains,
                                                                 const TrackerGeometry& geom)
    : gains_(&gains) {
  // find the first strip detector unit: detUnits() lists all pixel modules first, so n_detectors ends up counting only the pixel detectors
  auto const& dus = geom.detUnits();
  unsigned int n_detectors = dus.size();
  for (unsigned int i = 1; i < 7; ++i) {
    const auto offset = geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]);
    if (offset != dus.size() && dus[offset]->type().isTrackerStrip()) {
      if (n_detectors > offset)
        n_detectors = offset;
    }
  }

  LogDebug("SiPixelGainCalibrationForHLTGPU")
      << "caching calibs for " << n_detectors << " pixel detectors of size " << gains.data().size() << '\n'
      << "sizes " << sizeof(char) << ' ' << sizeof(uint8_t) << ' ' << sizeof(SiPixelGainForHLTonGPU::DecodingStructure);

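  // allocate the payload struct in pinned (page-locked) host memory, so that the
  // later cudaMemcpyAsync to the device can run asynchronously with respect to the host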
  cudaCheck(cudaMallocHost((void**)&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
  gainForHLTonHost_->v_pedestals_ =
      (SiPixelGainForHLTonGPU_DecodingStructure*)this->gains_->data().data();  // so it can be used on CPU as well...

  // do not read back from the (possibly write-combined) memory buffer
  auto minPed = gains.getPedLow();
  auto maxPed = gains.getPedHigh();
  auto minGain = gains.getGainLow();
  auto maxGain = gains.getGainHigh();
  auto nBinsToUseForEncoding = 253;

  // we will simplify later (not everything is needed....)
  gainForHLTonHost_->minPed_ = minPed;
  gainForHLTonHost_->maxPed_ = maxPed;
  gainForHLTonHost_->minGain_ = minGain;
  gainForHLTonHost_->maxGain_ = maxGain;

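  // encoding constants: each stored value is averaged over 80 pixel rows (one ROC column),
  // and the 8-bit codes 255 and 254 flag dead and noisy channels, respectively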
  gainForHLTonHost_->numberOfRowsAveragedOver_ = 80;
  gainForHLTonHost_->nBinsToUseForEncoding_ = nBinsToUseForEncoding;
  gainForHLTonHost_->deadFlag_ = 255;
  gainForHLTonHost_->noisyFlag_ = 254;

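  // step size of the 8-bit encoding: decoded value = min + code * precision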
  gainForHLTonHost_->pedPrecision_ = static_cast<float>(maxPed - minPed) / nBinsToUseForEncoding;
  gainForHLTonHost_->gainPrecision_ = static_cast<float>(maxGain - minGain) / nBinsToUseForEncoding;

  LogDebug("SiPixelGainCalibrationForHLTGPU")
      << "precisions g " << gainForHLTonHost_->pedPrecision_ << ' ' << gainForHLTonHost_->gainPrecision_;

  // fill the index map
  auto const& ind = gains.getIndexes();
  LogDebug("SiPixelGainCalibrationForHLTGPU") << ind.size() << " " << n_detectors;

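  // for each pixel detector, locate its entry in the index map (ordered by raw DetId)
  // and store its [begin, end) range into the calibration data together with the number of columns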
  for (auto i = 0U; i < n_detectors; ++i) {
    auto p = std::lower_bound(
        ind.begin(), ind.end(), dus[i]->geographicalId().rawId(), SiPixelGainCalibrationForHLT::StrictWeakOrdering());
    assert(p != ind.end() && p->detid == dus[i]->geographicalId());
    assert(p->iend <= gains.data().size());
    assert(p->iend >= p->ibegin);
    assert(0 == p->ibegin % 2);
    assert(0 == p->iend % 2);
    assert(p->ibegin != p->iend);
    assert(p->ncols > 0);
    gainForHLTonHost_->rangeAndCols_[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin, p->iend), p->ncols);
    if (ind[i].detid != dus[i]->geographicalId())
      LogDebug("SiPixelGainCalibrationForHLTGPU") << ind[i].detid << "!=" << dus[i]->geographicalId();
  }
}

SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); }

SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
  cudaCheck(cudaFree(gainForHLTonGPU));
  cudaCheck(cudaFree(gainDataOnGPU));
}

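// return the payload for the current device; the lambda fills it at most once per device,
// copying the data asynchronously on the given CUDA stream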
const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const {
  const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) {
    cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
    cudaCheck(cudaMalloc((void**)&data.gainDataOnGPU, this->gains_->data().size()));
    // gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
    cudaCheck(cudaMemcpyAsync(
        data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream));

    cudaCheck(cudaMemcpyAsync(
        data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream));
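    // patch v_pedestals_ inside the device-side copy of the struct so that it points to the
    // calibration data on the device (the host-side copy keeps pointing to the host buffer)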
    cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals_),
                              &(data.gainDataOnGPU),
                              sizeof(SiPixelGainForHLTonGPU_DecodingStructure*),
                              cudaMemcpyDefault,
                              stream));
  });
  return data.gainForHLTonGPU;
}