Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-11-12 00:00:59

0001 #include "DQM/EcalMonitorClient/interface/MLClient.h"
0002 
0003 #include "DataFormats/EcalDetId/interface/EcalTrigTowerDetId.h"
0004 
0005 #include "CondFormats/EcalObjects/interface/EcalDQMStatusHelper.h"
0006 
0007 #include "DQM/EcalCommon/interface/EcalDQMCommonUtils.h"
0008 
0009 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0010 
0011 #include "PhysicsTools/ONNXRuntime/interface/ONNXRuntime.h"
0012 
0013 #include "DQM/EcalCommon/interface/MESetNonObject.h"
0014 
0015 using namespace cms::Ort;
0016 
0017 namespace ecaldqm {
0018   MLClient::MLClient() : DQWorkerClient() { qualitySummaries_.insert("MLQualitySummary"); }
0019 
0020   void MLClient::setParams(edm::ParameterSet const& _params) {
0021     MLThreshold_ = _params.getUntrackedParameter<double>("MLThreshold");
0022     PUcorr_slope_ = _params.getUntrackedParameter<double>("PUcorr_slope");
0023     PUcorr_intercept_ = _params.getUntrackedParameter<double>("PUcorr_intercept");
0024     avgOcc_ = _params.getUntrackedParameter<std::vector<double>>("avgOcc");
0025     if (!onlineMode_) {
0026       MEs_.erase(std::string("MLQualitySummary"));
0027       MEs_.erase(std::string("EventsperMLImage"));
0028       sources_.erase(std::string("PU"));
0029       sources_.erase(std::string("NumEvents"));
0030       sources_.erase(std::string("DigiAllByLumi"));
0031       sources_.erase(std::string("AELoss"));
0032     }
0033   }
0034 
0035   void MLClient::producePlots(ProcessType) {
0036     if (!onlineMode_)
0037       return;
0038     using namespace std;
0039     MESet& meMLQualitySummary(MEs_.at("MLQualitySummary"));
0040     MESet& meEventsperMLImage(MEs_.at("EventsperMLImage"));
0041 
0042     MESetNonObject const& sPU(static_cast<MESetNonObject&>(sources_.at("PU")));
0043     MESetNonObject const& sNumEvents(static_cast<MESetNonObject&>(sources_.at("NumEvents")));
0044 
0045     //Get the no.of events and the PU per LS calculated in OccupancyTask
0046     int nEv = sNumEvents.getFloatValue();
0047     double pu = sPU.getFloatValue();
0048     //Do not compute ML quality if PU is non existent.
0049     if (pu < 0.) {
0050       return;
0051     }
0052     uint32_t mask(1 << EcalDQMStatusHelper::PEDESTAL_ONLINE_HIGH_GAIN_RMS_ERROR |
0053                   1 << EcalDQMStatusHelper::PHYSICS_BAD_CHANNEL_WARNING |
0054                   1 << EcalDQMStatusHelper::PHYSICS_BAD_CHANNEL_ERROR);
0055 
0056     //////////////// ML Data Preprocessing //////////////////////////////////
0057     //Inorder to feed the data into the ML model we apply some preprocessing.
0058     //We use the Digi Occupancy per Lumisection as the input source.
0059     //The model was trained on each occupancy plot having 500 events.
0060     //In apprehension of the low luminosity in the beginning of Run3, where in online DQM
0061     //the no.of events per LS could be lower than 500, we sum the occupancies over a fixed no.of lumisections as a running sum,
0062     //and require that the total no.of events on this summed occupancy to be atleast 200.
0063     //(This no.of LS and the no.of events are parameters which would require tuning later)
0064     //This summed occupancy is now the input image, which is then corrected for PileUp(PU) dependence and
0065     //change in no.of events, which are derived from training.
0066     //The input image is also padded by replicating the top and bottom rows so as to prevent the "edge effect"
0067     //wherein the ML model's learning degrades near the edge of the data set it sees.
0068     //This padding is then removed during inference on the model output.
0069 
0070     //Get the histogram of the input digi occupancy per lumisection.
0071     TH2F* hEbDigiMap((sources_.at("DigiAllByLumi")).getME(1)->getTH2F());
0072 
0073     size_t nTowers = nEtaTowers * nPhiTowers;  //Each occupancy map is of size 34x72 towers
0074     std::vector<float> ebOccMap1dCumulPad;     //Vector to feed into the ML network
0075     std::valarray<float> ebOccMap1d(nTowers);  //Array to store occupancy map of size 34x72
0076     //Store the values from the input histogram into the array
0077     //to do preprocessing
0078     for (int i = 0; i < hEbDigiMap->GetNbinsY(); i++) {  //NbinsY = 34, NbinsX = 72
0079       for (int j = 0; j < hEbDigiMap->GetNbinsX(); j++) {
0080         int bin = hEbDigiMap->GetBin(j + 1, i + 1);
0081         int k = (i * nPhiTowers) + j;
0082         ebOccMap1d[k] = hEbDigiMap->GetBinContent(bin);
0083       }
0084     }
0085     ebOccMap1dQ.push_back(ebOccMap1d);  //Queue which stores input occupancy maps for nLS lumis
0086     NEventQ.push_back(nEv);             //Queue which stores the no.of events per LS for nLS lumis
0087 
0088     if (NEventQ.size() < nLS) {
0089       return;  //Should have nLS lumis to add the occupancy over.
0090     }
0091     if (NEventQ.size() > nLS) {
0092       NEventQ.pop_front();  //Keep only nLS consecutive LS. Pop the first one if size greater than nLS
0093     }
0094     if (ebOccMap1dQ.size() > nLS) {
0095       ebOccMap1dQ.pop_front();  //Same conditon for the input occupancy maps.
0096     }
0097 
0098     int TNum = 0;
0099     for (size_t i = 0; i < nLS; i++) {
0100       TNum += NEventQ[i];  //Total no.of events over nLS lumis
0101     }
0102     if (TNum < 200) {
0103       return;  //The total no.of events should be atleast 200 over nLS for meaningful statistics
0104     }
0105     //Fill the ME to monitor the trend of the total no.of events in each input image to the ML model
0106     meEventsperMLImage.fill(getEcalDQMSetupObjects(), EcalBarrel, double(timestamp_.iLumi), double(TNum));
0107 
0108     //Array to hold the sum of inputs, which make atleast 200 events.
0109     std::valarray<float> ebOccMap1dCumul(0., nTowers);
0110 
0111     for (size_t i = 0; i < ebOccMap1dQ.size(); i++) {
0112       ebOccMap1dCumul += ebOccMap1dQ[i];  //Sum the input arrays of N LS.
0113     }
0114     //Applying PU correction derived from training
0115     ebOccMap1dCumul = ebOccMap1dCumul / (PUcorr_slope_ * pu + PUcorr_intercept_);
0116 
0117     //Scaling up to match input dimensions. 36*72 used instead of 34*72 to accommodate the additional padding
0118     //of 2 rows to prevent the "edge effect" which is done below
0119     ebOccMap1dCumul = ebOccMap1dCumul * (nEtaTowersPad * nPhiTowers);
0120 
0121     //Correction for no.of events in each input image as originally model trained with 500 events per image
0122     ebOccMap1dCumul = ebOccMap1dCumul * (500. / TNum);
0123 
0124     //The pre-processed input is now fed into the input tensor vector which will go into the ML model
0125     ebOccMap1dCumulPad.assign(std::begin(ebOccMap1dCumul), std::end(ebOccMap1dCumul));
0126 
0127     //Replicate and pad with the first and last row to prevent the edge effect
0128     for (int k = 0; k < nPhiTowers; k++) {
0129       float val = ebOccMap1dCumulPad[nPhiTowers - 1];
0130       ebOccMap1dCumulPad.insert(ebOccMap1dCumulPad.begin(),
0131                                 val);  //padding in the beginning with the first row elements
0132     }
0133 
0134     int size = ebOccMap1dCumulPad.size();
0135     for (int k = (size - nPhiTowers); k < size; k++) {
0136       float val = ebOccMap1dCumulPad[k];
0137       ebOccMap1dCumulPad.push_back(val);  //padding at the end with the last row elements
0138     }
0139 
0140     ///// Model Inference //////
0141     //An Autoencoder (AE) network with resnet architecture is used here which is trained on
0142     //certified good data (EB digi occupancy) from Run 2018 data.
0143     //On giving an input occupancy map, the encoder part of the AE compresses and reduces the input data, learning its features,
0144     //and the decoder reconstructs the data from the encoded form into a representation as close to the original input as possible.
0145     //We then compute the Mean squared error (MSE) between the input and output image, also called the Reconstruction Loss,
0146     //calculated at a tower by tower basis.
0147     //Thus, given an anomalous tower the loss should be significantly higher than the loss with respect to good towers, which the model
0148     //has already seen --> anomaly detection.
0149     //When calculating the loss we also apply a response correction by dividing each input and output image with the average occupancy from
0150     //all 2018 data (also to be tuned),to accommodate the difference in response of crystals in different regions of the Ecal Barrel
0151     //Further each loss map from each input image is then multiplied by the last N loss maps,
0152     ///so that real anomalies which persist with time are enhanced and fluctuations are suppressed.
0153     //A quality threshold is then applied on this time multiplied loss map, to mark them as GOOD or BAD,
0154     //after which it is stored as a quality summary ME.
0155 
0156     ///ONNX model running///
0157     std::string instanceName{"AE-DQM-inference"};
0158     std::string modelFilepath = edm::FileInPath("DQM/EcalMonitorClient/data/onnxModels/resnet.onnx").fullPath();
0159 
0160     Ort::SessionOptions sessionOptions;
0161     sessionOptions.SetIntraOpNumThreads(1);
0162     Ort::Env env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, instanceName.c_str());
0163     Ort::Session session(env, modelFilepath.c_str(), sessionOptions);
0164 
0165     Ort::AllocatorWithDefaultOptions allocator;
0166 
0167     const char* inputName = session.GetInputName(0, allocator);
0168 
0169     Ort::TypeInfo inputTypeInfo = session.GetInputTypeInfo(0);
0170     auto inputTensorInfo = inputTypeInfo.GetTensorTypeAndShapeInfo();
0171 
0172     std::vector<int64_t> inputDims = inputTensorInfo.GetShape();
0173 
0174     const char* outputName = session.GetOutputName(0, allocator);
0175 
0176     Ort::TypeInfo outputTypeInfo = session.GetOutputTypeInfo(0);
0177     auto outputTensorInfo = outputTypeInfo.GetTensorTypeAndShapeInfo();
0178 
0179     std::vector<int64_t> outputDims = outputTensorInfo.GetShape();
0180 
0181     size_t TensorSize = nEtaTowersPad * nPhiTowers;
0182     std::vector<float> ebRecoOccMap1dPad(TensorSize);  //To store the output reconstructed occupancy
0183 
0184     std::vector<const char*> inputNames{inputName};
0185     std::vector<const char*> outputNames{outputName};
0186     std::vector<Ort::Value> inputTensors;
0187     std::vector<Ort::Value> outputTensors;
0188 
0189     Ort::MemoryInfo memoryInfo =
0190         Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
0191     inputTensors.push_back(Ort::Value::CreateTensor<float>(
0192         memoryInfo, ebOccMap1dCumulPad.data(), TensorSize, inputDims.data(), inputDims.size()));
0193 
0194     outputTensors.push_back(Ort::Value::CreateTensor<float>(
0195         memoryInfo, ebRecoOccMap1dPad.data(), TensorSize, outputDims.data(), outputDims.size()));
0196 
0197     session.Run(Ort::RunOptions{nullptr},
0198                 inputNames.data(),
0199                 inputTensors.data(),
0200                 1,
0201                 outputNames.data(),
0202                 outputTensors.data(),
0203                 1);
0204 
0205     ///Inference on the output from the model///
0206     //2D Loss map to store tower by tower loss between the output (reconstructed) and input occupancies,
0207     //Have same dimensions as the occupancy plot
0208     std::valarray<std::valarray<float>> lossMap2d(std::valarray<float>(nPhiTowers), nEtaTowers);
0209 
0210     //1D val arrays to store row wise information corresponding to the reconstructed, input and average occupancies, and loss.
0211     //and to do element wise (tower wise) operations on them to calculate the MSE loss between the reco and input occupancy.
0212     std::valarray<float> recoOcc1d(0., nPhiTowers);
0213     std::valarray<float> inputOcc1d(0., nPhiTowers);
0214     std::valarray<float> avgOcc1d(0., nPhiTowers);
0215     std::valarray<float> loss_;
0216 
0217     //Loss calculation
0218     //Ignore the top and bottom replicated padded rows when doing inference
0219     //by making index i run over (1,35) instead of (0,36)
0220     for (int i = 1; i < 35; i++) {
0221       for (int j = 0; j < nPhiTowers; j++) {
0222         int k = (i * nPhiTowers) + j;
0223         recoOcc1d[j] = ebRecoOccMap1dPad[k];
0224         inputOcc1d[j] = ebOccMap1dCumulPad[k];
0225         avgOcc1d[j] = avgOcc_[k];
0226       }
0227       //Calculate the MSE loss = (output-input)^2, with avg response correction
0228       loss_ = std::pow((recoOcc1d / avgOcc1d - inputOcc1d / avgOcc1d), 2);
0229       lossMap2d[i - 1] = (loss_);
0230     }
0231 
0232     lossMap2dQ.push_back(lossMap2d);  //Store each loss map from the output in the queue
0233     if (lossMap2dQ.size() > nLSloss) {
0234       lossMap2dQ.pop_front();  //Keep exactly nLSloss loss maps to multiply
0235     }
0236     if (lossMap2dQ.size() < nLSloss) {  //Exit if there are not nLSloss loss maps
0237       return;
0238     }
0239     //To hold the final multiplied loss
0240     std::valarray<std::valarray<float>> lossMap2dMult(std::valarray<float>(1., nPhiTowers), nEtaTowers);
0241 
0242     //Multiply together the last nLSloss loss maps
0243     //So that real anomalies which persist with time are enhanced and fluctuations are suppressed.
0244     for (size_t i = 0; i < lossMap2dQ.size(); i++) {
0245       lossMap2dMult *= lossMap2dQ[i];
0246     }
0247 
0248     //Fill the AELoss ME with the values of this time multiplied loss map
0249     MESet const& sAELoss(sources_.at("AELoss"));
0250     TH2F* hLossMap2dMult(sAELoss.getME(1)->getTH2F());
0251     for (int i = 0; i < hLossMap2dMult->GetNbinsY(); i++) {
0252       for (int j = 0; j < hLossMap2dMult->GetNbinsX(); j++) {
0253         int bin_ = hLossMap2dMult->GetBin(j + 1, i + 1);
0254         double content = lossMap2dMult[i][j];
0255         hLossMap2dMult->SetBinContent(bin_, content);
0256       }
0257     }
0258     ///////////////////// ML Quality Summary /////////////////////
0259     //Apply the quality threshold on the time multiplied loss map stored in the ME AELoss
0260     //If anomalous, the tower entry will have a large loss value. If good, the value will be close to zero.
0261 
0262     MESet::const_iterator dAEnd(sAELoss.end(GetElectronicsMap()));
0263     for (MESet::const_iterator dItr(sAELoss.beginChannel(GetElectronicsMap())); dItr != dAEnd;
0264          dItr.toNextChannel(GetElectronicsMap())) {
0265       DetId id(dItr->getId());
0266 
0267       bool doMaskML(meMLQualitySummary.maskMatches(id, mask, statusManager_, GetTrigTowerMap()));
0268 
0269       float entries(dItr->getBinContent());
0270       int quality(doMaskML ? kMGood : kGood);
0271       //If a trigger tower entry is greater than the ML threshold, set it to Bad quality, otherwise Good.
0272       if (entries > MLThreshold_) {
0273         quality = doMaskML ? kMBad : kBad;
0274       }
0275       //Fill the quality summary with the quality of the given tower id.
0276       meMLQualitySummary.setBinContent(getEcalDQMSetupObjects(), id, double(quality));
0277     }  // ML Quality Summary
0278   }    // producePlots()
0279 
0280   DEFINE_ECALDQM_WORKER(MLClient);
0281 }  // namespace ecaldqm