#include <alpaka/alpaka.hpp>

#include "DataFormats/DetId/interface/DetId.h"
#include "DataFormats/EcalDigi/interface/EcalConstants.h"
#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h"
#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h"
#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"

#include "UnpackPortable.h"

namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw {

  using namespace ::ecal::raw;
  using namespace cms::alpakatools;

  class Kernel_unpack {
  public:
    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  unsigned char const* __restrict__ data,
                                  uint32_t const* __restrict__ offsets,
                                  int const* __restrict__ feds,
                                  EcalDigiDeviceCollection::View digisDevEB,
                                  EcalDigiDeviceCollection::View digisDevEE,
                                  EcalElectronicsMappingDevice::ConstView eid2did,
                                  uint32_t const nbytesTotal) const {
      constexpr auto kSampleSize = ecalPh1::sampleSize;

      // indices
      auto const ifed = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
      auto const threadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u];

      // offset in bytes
      auto const offset = offsets[ifed];
      // fed id
      auto const fed = feds[ifed];
      auto const isBarrel = is_barrel(static_cast<uint8_t>(fed - 600));
      // size
      auto const gridDim = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u];
      auto const size = ifed == gridDim - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset;
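      // choose the output digi collection (EB or EE) according to the DCC id of this FED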
      auto* samples = isBarrel ? digisDevEB.data()->data() : digisDevEE.data()->data();
      auto* ids = isBarrel ? digisDevEB.id() : digisDevEE.id();
      auto* pChannelsCounter = isBarrel ? &digisDevEB.size() : &digisDevEE.size();

      // offset to the right raw buffer
      uint64_t const* buffer = reinterpret_cast<uint64_t const*>(data + offset);

      // dump first 3 bits for each 64-bit word
      //print_first3bits(buffer, size / 8);

      //
      // fed header
      //
      auto const fed_header = buffer[0];
      uint32_t bx = (fed_header >> H_BX_B) & H_BX_MASK;
      uint32_t lv1 = (fed_header >> H_L1_B) & H_L1_MASK;
      uint32_t triggerType = (fed_header >> H_TTYPE_B) & H_TTYPE_MASK;

      // determine the number of FE channels from the trigger type
      uint32_t numbChannels(0);
      if (triggerType == PHYSICTRIGGER) {
        numbChannels = NUMB_FE;
      } else if (triggerType == CALIBRATIONTRIGGER) {
        numbChannels = NUMB_FE + 2;  // FE + 2 MEM blocks
      } else {
        // unsupported trigger type
        return;
      }

      // 9 for fed + dcc header
      // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block
      // 6 for SR block size

      // dcc header w2
      auto const w2 = buffer[2];
      uint8_t const fov = (w2 >> H_FOV_B) & H_FOV_MASK;

      // make a list of channels with data, based on the channel status in the DCC header
      // this could be done once per block instead of by each thread, since it is defined per FED
      uint8_t exp_ttids[NUMB_FE + 2];  // FE + 2 MEM blocks
      uint8_t ch = 1;
      uint8_t nCh = 0;
      for (uint8_t i = 4; i < 9; ++i) {           // data words with channel status info
        for (uint8_t j = 0; j < 14; ++j, ++ch) {  // channel status fields in one data word
          const uint8_t shift = j * 4;            // each channel has 4 bits
          const int chStatus = (buffer[i] >> shift) & H_CHSTATUS_MASK;
          const bool regular = (chStatus == CH_DISABLED || chStatus == CH_SUPPRESS);
          const bool problematic =
              (chStatus == CH_TIMEOUT || chStatus == CH_HEADERERR || chStatus == CH_LINKERR ||
               chStatus == CH_LENGTHERR || chStatus == CH_IFIFOFULL || chStatus == CH_L1AIFIFOFULL);
          if (!(regular || problematic)) {
            exp_ttids[nCh] = ch;
            ++nCh;
          }
        }
      }

      //
      // loop over the tower blocks
      //
      uint8_t ntccblockwords = isBarrel ? 18 : 36;
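      // skip the 9 FED + DCC header words, the TCC block(s) and the 6 SR block words to reach the first tower block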
      auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6;
      auto const* trailer = buffer + (size / 8 - 1);
      auto const* current_tower_block = tower_blocks_start;
      uint8_t iCh = 0;
      uint8_t next_tower_id = exp_ttids[iCh];
      while (current_tower_block < trailer && iCh < numbChannels) {
        auto const w = *current_tower_block;
        uint8_t ttid = w & TOWER_ID_MASK;
        uint16_t bxlocal = (w >> TOWER_BX_B) & TOWER_BX_MASK;
        uint16_t lv1local = (w >> TOWER_L1_B) & TOWER_L1_MASK;
        uint16_t block_length = (w >> TOWER_LENGTH_B) & TOWER_LENGTH_MASK;

        // fast forward to the next good tower id (in case of recovery from an earlier header corruption)
        while (exp_ttids[iCh] < next_tower_id) {
          ++iCh;
        }
        ++iCh;

        // check if the tower id in the tower header is the one expected;
        // if not, try to find the next good header, point current_tower_block to it, and extract its tower id,
        // or break if there is none
        if (ttid != next_tower_id) {
          next_tower_id = find_next_tower_block(current_tower_block, trailer, bx, lv1);
          if (next_tower_id < TOWER_ID_MASK) {
            continue;
          } else {
            break;
          }
        }

        // prepare for the next iteration
        next_tower_id = exp_ttids[iCh];

        uint16_t const dccbx = bx & 0xfff;
        uint16_t const dccl1 = lv1 & 0xfff;
        // fov>=1 is required to support simulated data for which bx==bxlocal==0
        if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) {
          current_tower_block += block_length;
          continue;
        }

        // go through all the channels
        // get the next channel coordinates
        uint32_t const nchannels = (block_length - 1) / 3;

        bool bad_block = false;
        auto& ch_with_bad_block = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
        if (once_per_block(acc)) {
          ch_with_bad_block = std::numeric_limits<uint32_t>::max();
        }
        // make sure the shared memory is initialised for all threads
        alpaka::syncBlockThreads(acc);

        auto const threadsPerBlock = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u];
        // 1 thread per channel in this block
        // All threads enter the loop, including those with channel indices >= nchannels.
        // The threads with excess indices perform no operations but still reach the syncBlockThreads() inside the loop.
        for (uint32_t i = 0; i < nchannels; i += threadsPerBlock) {
          auto const channel = i + threadIdx;

          uint64_t wdata;
          uint8_t stripid;
          uint8_t xtalid;

          // threads must be inside the range (no break here because of the syncBlockThreads() afterwards)
          if (channel < nchannels && channel < ch_with_bad_block) {
            // read the first data word of this channel and extract the strip and crystal ids
            wdata = current_tower_block[1 + channel * 3];
            stripid = wdata & 0x7;
            xtalid = (wdata >> 4) & 0x7;

            // check that the stripid and xtalid are in the allowed range; if not, skip the rest of the block
            if (stripid < ElectronicsIdGPU::MIN_STRIPID || stripid > ElectronicsIdGPU::MAX_STRIPID ||
                xtalid < ElectronicsIdGPU::MIN_XTALID || xtalid > ElectronicsIdGPU::MAX_XTALID) {
              bad_block = true;
            }
            if (channel > 0) {
              // check that the stripid, or the xtalid within the same strip, has increased with respect to
              // the previous data word; if not, something is wrong and the rest of the block is skipped
              auto const prev_channel = channel - 1;
              auto const prevwdata = current_tower_block[1 + prev_channel * 3];
              uint8_t const laststripid = prevwdata & 0x7;
              uint8_t const lastxtalid = (prevwdata >> 4) & 0x7;
              if ((stripid == laststripid && xtalid <= lastxtalid) || (stripid < laststripid)) {
                bad_block = true;
              }
            }
          }

          // check if this thread has the lowest channel with a bad block
          if (bad_block && channel < ch_with_bad_block) {
            alpaka::atomicMin(acc, &ch_with_bad_block, channel, alpaka::hierarchy::Threads{});
          }

          // make sure that all threads that need to set the ch_with_bad_block shared memory have done so
          alpaka::syncBlockThreads(acc);

          // threads outside the range, or with a bad block detected by this thread or by one working on a
          // lower channel, stop this loop iteration here
          if (channel >= nchannels || channel >= ch_with_bad_block) {
            continue;
          }

          ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid};
          auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()].rawid();
          // skip channels with an invalid detid
          if (didraw == 0)
            continue;

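          // each channel occupies three consecutive 64-bit words: the first also carries the strip and
          // crystal ids, and the 10 time samples are stored in 16-bit fields, of which the lower 14 bits
          // are used (12-bit ADC value plus 2-bit gain id)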
          // get samples
          uint16_t sampleValues[kSampleSize];
          sampleValues[0] = (wdata >> 16) & 0x3fff;
          sampleValues[1] = (wdata >> 32) & 0x3fff;
          sampleValues[2] = (wdata >> 48) & 0x3fff;
          auto const wdata1 = current_tower_block[2 + channel * 3];
          sampleValues[3] = wdata1 & 0x3fff;
          sampleValues[4] = (wdata1 >> 16) & 0x3fff;
          sampleValues[5] = (wdata1 >> 32) & 0x3fff;
          sampleValues[6] = (wdata1 >> 48) & 0x3fff;
          auto const wdata2 = current_tower_block[3 + channel * 3];
          sampleValues[7] = wdata2 & 0x3fff;
          sampleValues[8] = (wdata2 >> 16) & 0x3fff;
          sampleValues[9] = (wdata2 >> 32) & 0x3fff;

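          // a gain id of 0 flags saturation: the channel is kept only if the first gain-0 sample occurs no
          // earlier than the fourth sample and is followed by a plateau of gain-0 samples with the same ADC
          // value; channels without any gain-0 sample are instead checked for spurious gain switches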
          // check gain
          bool isSaturation = true;
          short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1};
          for (uint32_t si = 0; si < kSampleSize; ++si) {
            if (gainId(sampleValues[si]) == 0) {
              firstGainZeroSampID = si;
              firstGainZeroSampADC = adc(sampleValues[si]);
              break;
            }
          }
          if (firstGainZeroSampID != -1) {
            unsigned int plateauEnd = std::min(kSampleSize, (unsigned int)(firstGainZeroSampID + 5));
            for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) {
              if (!(gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC)) {
                isSaturation = false;
                break;
              }  // it's not saturation
            }
            // get rid of channels which are stuck in gain0
            if (firstGainZeroSampID < 3) {
              isSaturation = false;
            }
            if (!isSaturation)
              continue;
          } else {  // there is no zero gainId sample
            // gain switch check
            short numGain = 1;
            bool gainSwitchError = false;
            for (unsigned int si = 1; si < kSampleSize; ++si) {
              if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5)
                gainSwitchError = true;
              if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si]))
                numGain++;
              else
                numGain = 1;
            }
            if (gainSwitchError)
              continue;
          }

          auto const pos = alpaka::atomicAdd(acc, pChannelsCounter, 1u, alpaka::hierarchy::Threads{});

          // store to global
          ids[pos] = didraw;
          std::memcpy(&samples[pos * kSampleSize], sampleValues, kSampleSize * sizeof(uint16_t));
        }

        current_tower_block += block_length;
      }
    }

  private:
    ALPAKA_FN_INLINE ALPAKA_FN_ACC void print_raw_buffer(uint8_t const* const buffer,
                                                         uint32_t const nbytes,
                                                         uint32_t const nbytes_per_row = 20) const {
      for (uint32_t i = 0; i < nbytes; ++i) {
        if (i % nbytes_per_row == 0 && i > 0)
          printf("\n");
        printf("%02X ", buffer[i]);
      }
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC void print_first3bits(uint64_t const* buffer, uint32_t size) const {
      for (uint32_t i = 0; i < size; ++i) {
        uint8_t const b61 = (buffer[i] >> 61) & 0x1;
        uint8_t const b62 = (buffer[i] >> 62) & 0x1;
        uint8_t const b63 = (buffer[i] >> 63) & 0x1;
        printf("[word: %u] %u%u%u\n", i, b63, b62, b61);
      }
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC bool is_barrel(uint8_t dccid) const {
      return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP;
    }

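    // ECAL FED ids start at 600, so the corresponding DCC id is obtained by subtracting 600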
    ALPAKA_FN_INLINE ALPAKA_FN_ACC uint8_t fed2dcc(int fed) const { return static_cast<uint8_t>(fed - 600); }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC int zside_for_eb(ElectronicsIdGPU const& eid) const {
      int dcc = eid.dccId();
      return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1;
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC uint8_t find_next_tower_block(uint64_t const*& current_tower_block,
                                                                 uint64_t const* trailer,
                                                                 uint32_t const bx,
                                                                 uint32_t const lv1) const {
      const auto* next_tower_block = current_tower_block + 1;  // move forward to skip the broken header

      // expected LV1, BX, #TS
      const uint64_t lv1local = ((lv1 - 1) & TOWER_L1_MASK);
      const uint64_t bxlocal = (bx != 3564) ? bx : 0;
      // The CPU unpacker also checks the # time samples expected in the header
      // but those are currently not available here

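      // the constant terms in 'sign' and 'mask' below select the fixed bits that identify a tower block
      // header word, in addition to the expected local L1A and BX fields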
      // construct the expected tower block header and the corresponding mask
      const uint64_t sign = 0xC0000000C0000000 + (lv1local << TOWER_L1_B) + (bxlocal << TOWER_BX_B);
      const uint64_t mask =
          0xC0001000D0000000 + (uint64_t(TOWER_L1_MASK) << TOWER_L1_B) + (uint64_t(TOWER_BX_MASK) << TOWER_BX_B);

      while (next_tower_block < trailer) {
        if ((*next_tower_block & mask) == sign) {
          current_tower_block = next_tower_block;
          return uint8_t(*next_tower_block & TOWER_ID_MASK);
        } else {
          ++next_tower_block;
        }
      }
      return TOWER_ID_MASK;  // return the maximum value
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC bool is_synced_towerblock(uint16_t const dccbx,
                                                             uint16_t const bx,
                                                             uint16_t const dccl1,
                                                             uint16_t const l1) const {
      bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564);
      bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff));
      return bxsync && l1sync;
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC bool right_tower_for_eb(int tower) const {
      // for EB, two types of tower (LVRB top/bottom)
      return (tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) ||
             (tower > 60 && tower < 69);
    }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) const {
      // as in Geometry/EcalMapping/.../EcalElectronicsMapping
      auto const dcc = eid.dccId();
      auto const tower = eid.towerId();
      auto const strip = eid.stripId();
      auto const xtal = eid.xtalId();

      int smid = 0;
      int iphi = 0;
      bool EBPlus = (zside_for_eb(eid) > 0);
      bool EBMinus = !EBPlus;

      if (zside_for_eb(eid) < 0) {
        smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM;
        iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi;
        iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi);
      } else {
        smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP;
        iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi;
        iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1);
      }

      bool RightTower = right_tower_for_eb(tower);
      int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1;
      if (RightTower) {
        ieta += (strip - 1);
        if (strip % 2 == 1) {
          if (EBMinus)
            iphi += (xtal - 1) + 1;
          else
            iphi += (4 - (xtal - 1)) + 1;
        } else {
          if (EBMinus)
            iphi += (4 - (xtal - 1)) + 1;
          else
            iphi += (xtal - 1) + 1;
        }
      } else {
        ieta += 4 - (strip - 1);
        if (strip % 2 == 1) {
          if (EBMinus)
            iphi += (4 - (xtal - 1)) + 1;
          else
            iphi += (xtal - 1) + 1;
        } else {
          if (EBMinus)
            iphi += (xtal - 1) + 1;
          else
            iphi += (4 - (xtal - 1)) + 1;
        }
      }

      if (zside_for_eb(eid) < 0)
        ieta = -ieta;

      DetId did{DetId::Ecal, EcalBarrel};
      return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF);
    }

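    // each 16-bit sample word stores the ADC counts in the 12 lowest bits and the gain id in bits 12-13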
    ALPAKA_FN_INLINE ALPAKA_FN_ACC int adc(uint16_t sample) const { return sample & 0xfff; }

    ALPAKA_FN_INLINE ALPAKA_FN_ACC int gainId(uint16_t sample) const { return (sample >> 12) & 0x3; }
  };

  void unpackRaw(Queue& queue,
                 InputDataHost const& inputHost,
                 EcalDigiDeviceCollection& digisDevEB,
                 EcalDigiDeviceCollection& digisDevEE,
                 EcalElectronicsMappingDevice const& mapping,
                 uint32_t const nfedsWithData,
                 uint32_t const nbytesTotal) {
    // input device buffers
    ecal::raw::InputDataDevice inputDevice(queue, nbytesTotal, nfedsWithData);

    // transfer the raw data
    alpaka::memcpy(queue, inputDevice.data, inputHost.data);
    alpaka::memcpy(queue, inputDevice.offsets, inputHost.offsets);
    alpaka::memcpy(queue, inputDevice.feds, inputHost.feds);

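    // launch one block per FED; the threads of a block unpack the channels of each tower block in parallel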
    auto workDiv = cms::alpakatools::make_workdiv<Acc1D>(nfedsWithData, 32);  // 32 channels per block
    alpaka::exec<Acc1D>(queue,
                        workDiv,
                        Kernel_unpack{},
                        inputDevice.data.data(),
                        inputDevice.offsets.data(),
                        inputDevice.feds.data(),
                        digisDevEB.view(),
                        digisDevEE.view(),
                        mapping.const_view(),
                        nbytesTotal);
  }

}  // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw