src/bdt/Forest.cc

0001 //////////////////////////////////////////////////////////////////////////
0002 //                            Forest.cxx                                //
0003 // =====================================================================//
0004 // This is the object implementation of a forest of decision trees.     //
0005 // We need this to implement gradient boosting.                         //
0006 // References include                                                   //
0007 //    *Elements of Statistical Learning by Hastie,                      //
0008 //     Tibshirani, and Friedman.                                        //
0009 //    *Greedy Function Approximation: A Gradient Boosting Machine.      //
0010 //     Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001.    //
0011 //    *Inductive Learning of Tree-based Regression Models. Luis Torgo.  //
0012 //                                                                      //
0013 //////////////////////////////////////////////////////////////////////////
0014
0015 ///////////////////////////////////////////////////////////////////////////
0016 // _______________________Includes_______________________________________//
0017 ///////////////////////////////////////////////////////////////////////////
0018
0019 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Forest.h"
0020 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Utilities.h"
0021
0022 #include "FWCore/ParameterSet/interface/FileInPath.h"
0023
0024 #include "TStopwatch.h"
0025 #include "TString.h"
0026
0027 #include <iostream>
0028 #include <sstream>
0029 #include <algorithm>
0030 #include <iterator>
0031 #include <fstream>
0032 #include <utility>
0033
0034 using namespace emtf;
0035
0036 //////////////////////////////////////////////////////////////////////////
0037 // _______________________Constructor(s)________________________________//
0038 //////////////////////////////////////////////////////////////////////////
0039
0040 Forest::Forest() { events = std::vector<std::vector<Event*>>(1); }
0041
0042 //////////////////////////////////////////////////////////////////////////
0043 // ----------------------------------------------------------------------
0044 //////////////////////////////////////////////////////////////////////////
0045
0046 Forest::Forest(std::vector<Event*>& trainingEvents) { setTrainingEvents(trainingEvents); }
0047
0048 Forest::Forest(const Forest& forest) {
0049   transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const std::unique_ptr<Tree>& tree) {
0050     return std::make_unique<Tree>(*tree);
0051   });
0052 }
0053
0054 Forest& Forest::operator=(const Forest& forest) {
0055   trees.resize(0);
0056   trees.reserve(forest.trees.size());
0057   transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const std::unique_ptr<Tree>& tree) {
0058     return std::make_unique<Tree>(*tree);
0059   });
0060   return *this;
0061 }
0062
0063 //////////////////////////////////////////////////////////////////////////
0064 // ______________________Get/Set_Functions______________________________//
0065 //////////////////////////////////////////////////////////////////////////
0066
0067 void Forest::setTrainingEvents(std::vector<Event*>& trainingEvents) {
0068   // tell the forest which events to use for training
0069
0070   Event* e = trainingEvents[0];
0071   // Unused variable
0072   // unsigned int numrows = e->data.size();
0073
0074   // Reset the events matrix.
0075   events = std::vector<std::vector<Event*>>();
0076
0077   for (unsigned int i = 0; i < e->data.size(); i++) {
0078     events.push_back(trainingEvents);
0079   }
0080 }
0081
0082 //////////////////////////////////////////////////////////////////////////
0083 // ----------------------------------------------------------------------
0084 //////////////////////////////////////////////////////////////////////////
0085
0086 // return a copy of the training events
0087 std::vector<Event*> Forest::getTrainingEvents() { return events[0]; }
0088
0089 //////////////////////////////////////////////////////////////////////////
0090 // ----------------------------------------------------------------------
0091 //////////////////////////////////////////////////////////////////////////
0092
0093 // return the ith tree
0094 Tree* Forest::getTree(unsigned int i) {
0095   if (/*i>=0 && */ i < trees.size())
0096     return trees[i].get();
0097   else {
0098     //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl;
0099     return nullptr;
0100   }
0101 }
0102
0103 //////////////////////////////////////////////////////////////////////////
0104 // ______________________Various_Helpful_Functions______________________//
0105 //////////////////////////////////////////////////////////////////////////
0106
0107 unsigned int Forest::size() const {
0108   // Return the number of trees in the forest.
0109   return trees.size();
0110 }
0111
0112 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
0113 //*** Need to make a data structure that includes the next few functions ***
0114 //*** pertaining to events. These don't really have much to do with the  ***
0115 //*** forest class.                                                      ***
0116 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
0117
0118 //////////////////////////////////////////////////////////////////////////
0119 // ----------------------------------------------------------------------
0120 //////////////////////////////////////////////////////////////////////////
0121
0122 void Forest::listEvents(std::vector<std::vector<Event*>>& e) const {
0123   // Simply list the events in each event vector. We have multiple copies
0124   // of the events vector. Each copy is sorted according to a different
0125   // determining variable.
0126   std::cout << std::endl << "Listing Events... " << std::endl;
0127
0128   for (unsigned int i = 0; i < e.size(); i++) {
0129     std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl;
0130     for (unsigned int j = 0; j < e[i].size(); j++) {
0131       e[i][j]->outputEvent();
0132     }
0133     std::cout << std::endl;
0134   }
0135 }
0136
0137 //////////////////////////////////////////////////////////////////////////
0138 // ----------------------------------------------------------------------
0139 //////////////////////////////////////////////////////////////////////////
0140
0141 // We have to initialize Event::sortingIndex outside of a function since
0142 // it is a static member.
0143 int Event::sortingIndex = 1;
0144
0145 bool compareEvents(Event* e1, Event* e2) {
0146   // Sort the events according to the variable given by the sortingIndex.
0147   return e1->data[Event::sortingIndex] < e2->data[Event::sortingIndex];
0148 }
0149 //////////////////////////////////////////////////////////////////////////
0150 // ----------------------------------------------------------------------
0151 //////////////////////////////////////////////////////////////////////////
0152
0153 bool compareEventsById(Event* e1, Event* e2) {
0154   // Sort the events by ID. We need this to produce rate plots.
0155   return e1->id < e2->id;
0156 }
0157 //////////////////////////////////////////////////////////////////////////
0158 // ----------------------------------------------------------------------
0159 //////////////////////////////////////////////////////////////////////////
0160
0161 void Forest::sortEventVectors(std::vector<std::vector<Event*>>& e) const {
0162   // When a node chooses the optimum split point and split variable it needs
0163   // the events to be sorted according to the variable it is considering.
0164
0165   for (unsigned int i = 0; i < e.size(); i++) {
0166     Event::sortingIndex = i;
0167     std::sort(e[i].begin(), e[i].end(), compareEvents);
0168   }
0169 }
0170
0171 //////////////////////////////////////////////////////////////////////////
0172 // ----------------------------------------------------------------------
0173 //////////////////////////////////////////////////////////////////////////
0174
0175 void Forest::rankVariables(std::vector<int>& rank) const {
0176   // This function ranks the determining variables according to their importance
0177   // in determining the fit. Use a low learning rate for better results.
0178   // Separates completely useless variables from useful ones well,
0179   // but isn't the best at separating variables of similar importance.
0180   // This is calculated using the error reduction on the training set. The function
0181   // should be changed to use the testing set, but this works fine for now.
0182   // I will try to change this in the future.
0183
0184   // Initialize the vector v, which will store the total error reduction
0185   // for each variable i in v[i].
0186   std::vector<double> v(events.size(), 0);
0187
0188   //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl;
0189
0190   for (unsigned int j = 0; j < trees.size(); j++) {
0191     trees[j]->rankVariables(v);
0192   }
0193
0194   double max = *std::max_element(v.begin(), v.end());
0195
0196   // Scale the importance. Maximum importance = 100.
0197   for (unsigned int i = 0; i < v.size(); i++) {
0198     v[i] = 100 * v[i] / max;
0199   }
0200
0201   // Change the storage format so that we can keep the index
0202   // and the value associated after sorting.
0203   std::vector<std::pair<double, int>> w(events.size());
0204
0205   for (unsigned int i = 0; i < v.size(); i++) {
0206     w[i] = std::pair<double, int>(v[i], i);
0207   }
0208
0209   // Sort so that we can output in order of importance.
0210   std::sort(w.begin(), w.end());
0211
0212   // Output the results.
0213   for (int i = (v.size() - 1); i >= 0; i--) {
0214     rank.push_back(w[i].second);
0215     // std::cout << "x" << w[i].second  << ": " << w[i].first  << std::endl;
0216   }
0217
0218   // std::cout << std::endl << "Done." << std::endl << std::endl;
0219 }
0220
0221 //////////////////////////////////////////////////////////////////////////
0222 // ----------------------------------------------------------------------
0223 //////////////////////////////////////////////////////////////////////////
0224
0225 void Forest::saveSplitValues(const char* savefilename) const {
0226   // This function gathers all of the split values from the forest and puts them into lists.
0227
0228   std::ofstream splitvaluefile;
0229   splitvaluefile.open(savefilename);
0230
0231   // Initialize the matrix v, which will store the list of split values
0232   // for each variable i in v[i].
0233   std::vector<std::vector<double>> v(events.size(), std::vector<double>());
0234
0235   //std::cout << std::endl << "Gathering split values... " << std::endl;
0236
0237   // Gather the split values from each tree in the forest.
0238   for (unsigned int j = 0; j < trees.size(); j++) {
0239     trees[j]->getSplitValues(v);
0240   }
0241
0242   // Sort the lists of split values and remove the duplicates.
0243   for (unsigned int i = 0; i < v.size(); i++) {
0244     std::sort(v[i].begin(), v[i].end());
0245     v[i].erase(unique(v[i].begin(), v[i].end()), v[i].end());
0246   }
0247
0248   // Output the results after removing duplicates.
0249   // The 0th variable is special and is not used for splitting, so we start at 1.
0250   for (unsigned int i = 1; i < v.size(); i++) {
0251     TString splitValues;
0252     for (unsigned int j = 0; j < v[i].size(); j++) {
0253       std::stringstream ss;
0254       ss.precision(14);
0255       ss << std::scientific << v[i][j];
0256       splitValues += ",";
0257       splitValues += ss.str().c_str();
0258     }
0259
0260     splitValues = splitValues(1, splitValues.Length());
0261     splitvaluefile << splitValues << std::endl << std::endl;
0262     ;
0263   }
0264 }
0265 //////////////////////////////////////////////////////////////////////////
0266 // ______________________Update_Events_After_Fitting____________________//
0267 //////////////////////////////////////////////////////////////////////////
0268
0269 void Forest::updateRegTargets(Tree* tree, double learningRate, LossFunction* l) {
0270   // Prepare the global vector of events for the next tree.
0271   // Update the fit for each event and set the new target value
0272   // for the next tree.
0273
0274   // Get the list of terminal nodes for this tree.
0275   std::list<Node*>& tn = tree->getTerminalNodes();
0276
0277   // Loop through the terminal nodes.
0278   for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) {
0279     // Get the events in the current terminal region.
0280     std::vector<Event*>& v = (*it)->getEvents()[0];
0281
0282     // Fit the events depending on the loss function criteria.
0283     double fit = l->fit(v);
0284
0285     // Scale the rate at which the algorithm converges.
0286     fit = learningRate * fit;
0287
0288     // Store the official fit value in the terminal node.
0289     (*it)->setFitValue(fit);
0290
0291     // Loop through each event in the terminal region and update the
0292     // the target for the next tree.
0293     for (unsigned int j = 0; j < v.size(); j++) {
0294       Event* e = v[j];
0295       e->predictedValue += fit;
0296       e->data[0] = l->target(e);
0297     }
0298
0299     // Release memory.
0300     (*it)->getEvents() = std::vector<std::vector<Event*>>();
0301   }
0302 }
0303
0304 /////////////////////////////////////////////////////////////////////////
0305 // ----------------------------------------------------------------------
0306 /////////////////////////////////////////////////////////////////////////
0307
0308 void Forest::updateEvents(Tree* tree) {
0309   // Prepare the test events for the next tree.
0310
0311   // Get the list of terminal nodes for this tree.
0312   std::list<Node*>& tn = tree->getTerminalNodes();
0313
0314   // Loop through the terminal nodes.
0315   for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) {
0316     std::vector<Event*>& v = (*it)->getEvents()[0];
0317     double fit = (*it)->getFitValue();
0318
0319     // Loop through each event in the terminal region and update the
0320     // the global event it maps to.
0321     for (unsigned int j = 0; j < v.size(); j++) {
0322       Event* e = v[j];
0323       e->predictedValue += fit;
0324     }
0325
0326     // Release memory.
0327     (*it)->getEvents() = std::vector<std::vector<Event*>>();
0328   }
0329 }
0330
0331 //////////////////////////////////////////////////////////////////////////
0332 // ____________________Do/Test_the Regression___________________________//
0333 //////////////////////////////////////////////////////////////////////////
0334
0335 void Forest::doRegression(int nodeLimit,
0336                           int treeLimit,
0337                           double learningRate,
0338                           LossFunction* l,
0339                           const char* savetreesdirectory,
0340                           bool saveTrees) {
0341   // Build the forest using the training sample.
0342
0343   //std::cout << std::endl << "--Building Forest..." << std::endl << std::endl;
0344
0345   // The trees work with a matrix of events where the rows have the same set of events. Each row however
0346   // is sorted according to the feature variable given by event->data[row].
0347   // If we only had one set of events we would have to sort it according to the
0348   // feature variable every time we want to calculate the best split point for that feature.
0349   // By keeping sorted copies we avoid the sorting operation during splint point calculation
0350   // and save computation time. If we do not sort each of the rows the regression will fail.
0351   //std::cout << "Sorting event vectors..." << std::endl;
0352   sortEventVectors(events);
0353
0354   // See how long the regression takes.
0355   TStopwatch timer;
0356   timer.Start(kTRUE);
0357
0358   for (unsigned int i = 0; i < (unsigned)treeLimit; i++) {
0359     // std::cout << "++Building Tree " << i << "... " << std::endl;
0360     trees.emplace_back(std::make_unique<Tree>(events));
0361     Tree* tree = trees.back().get();
0362     tree->buildTree(nodeLimit);
0363
0364     // Update the targets for the next tree to fit.
0365     updateRegTargets(tree, learningRate, l);
0366
0367     // Save trees to xml in some directory.
0368     std::ostringstream ss;
0369     ss << savetreesdirectory << "/" << i << ".xml";
0370     std::string s = ss.str();
0371     const char* c = s.c_str();
0372
0373     if (saveTrees)
0374       tree->saveToXML(c);
0375   }
0376   //std::cout << std::endl;
0377   //std::cout << std::endl << "Done." << std::endl << std::endl;
0378
0379   //    std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
0380 }
0381
0382 //////////////////////////////////////////////////////////////////////////
0383 // ----------------------------------------------------------------------
0384 //////////////////////////////////////////////////////////////////////////
0385
0386 void Forest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees) {
0387   // Predict values for eventsp by running them through the forest up to numtrees.
0388
0389   //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
0390   if (numtrees > trees.size()) {
0391     //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
0392     numtrees = trees.size();
0393   }
0394
0395   // i iterates through the trees in the forest. Each tree corrects the last prediction.
0396   for (unsigned int i = 0; i < numtrees; i++) {
0397     //std::cout << "++Tree " << i << "..." << std::endl;
0398     appendCorrection(eventsp, i);
0399   }
0400 }
0401
0402 //////////////////////////////////////////////////////////////////////////
0403 // ----------------------------------------------------------------------
0404 //////////////////////////////////////////////////////////////////////////
0405
0406 void Forest::appendCorrection(std::vector<Event*>& eventsp, int treenum) {
0407   // Update the prediction by appending the next correction.
0408
0409   Tree* tree = trees[treenum].get();
0410   tree->filterEvents(eventsp);
0411
0412   // Update the events with their new prediction.
0413   updateEvents(tree);
0414 }
0415
0416 //////////////////////////////////////////////////////////////////////////
0417 // ----------------------------------------------------------------------
0418 //////////////////////////////////////////////////////////////////////////
0419
0420 void Forest::predictEvent(Event* e, unsigned int numtrees) {
0421   // Predict values for eventsp by running them through the forest up to numtrees.
0422
0423   //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
0424   if (numtrees > trees.size()) {
0425     //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
0426     numtrees = trees.size();
0427   }
0428
0429   // just like in line #2470 of https://root.cern.ch/doc/master/MethodBDT_8cxx_source.html for gradient boosting
0430   e->predictedValue = trees[0]->getBoostWeight();
0431
0432   // i iterates through the trees in the forest. Each tree corrects the last prediction.
0433   for (unsigned int i = 0; i < numtrees; i++) {
0434     //std::cout << "++Tree " << i << "..." << std::endl;
0435     appendCorrection(e, i);
0436   }
0437 }
0438
0439 //////////////////////////////////////////////////////////////////////////
0440 // ----------------------------------------------------------------------
0441 //////////////////////////////////////////////////////////////////////////
0442
0443 void Forest::appendCorrection(Event* e, int treenum) {
0444   // Update the prediction by appending the next correction.
0445
0446   Tree* tree = trees[treenum].get();
0447   Node* terminalNode = tree->filterEvent(e);
0448
0449   // Update the event with its new prediction.
0450   double fit = terminalNode->getFitValue();
0451   e->predictedValue += fit;
0452 }
0453 /////////////////////////////////////////////////////////////////////////////////////
0454 // ----------------------------------------------------------------------------------
0455 /////////////////////////////////////////////////////////////////////////////////////
0456
0457 void Forest::loadForestFromXML(const char* directory, unsigned int numTrees) {
0458   // Load a forest that has already been created and stored into XML somewhere.
0459
0460   // Initialize the vector of trees.
0461   trees.resize(numTrees);
0462   trees.shrink_to_fit();
0463
0464   // Load the Forest.
0465   // std::cout << std::endl << "Loading Forest from XML ... " << std::endl;
0466   for (unsigned int i = 0; i < numTrees; i++) {
0467     trees[i] = std::make_unique<Tree>();
0468
0469     std::stringstream ss;
0470     ss << directory << "/" << i << ".xml";
0471
0472     trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str());
0473   }
0474
0475   //std::cout << "Done." << std::endl << std::endl;
0476 }
0477
0478 void Forest::loadFromCondPayload(const L1TMuonEndCapForest::DForest& forest) {
0479   // Load a forest that has already been created and stored in CondDB.
0480   // Initialize the vector of trees.
0481   unsigned int numTrees = forest.size();
0482
0483   trees.resize(numTrees);
0484   trees.shrink_to_fit();
0485
0486   // Load the Forest.
0487   for (unsigned int i = 0; i < numTrees; i++) {
0488     trees[i] = std::make_unique<Tree>();
0489     trees[i]->loadFromCondPayload(forest[i]);
0490   }
0491 }
0492
0493 //////////////////////////////////////////////////////////////////////////
0494 // ___________________Stochastic_Sampling_&_Regression__________________//
0495 //////////////////////////////////////////////////////////////////////////
0496
0497 void Forest::prepareRandomSubsample(double fraction) {
0498   // We use this for Stochastic Gradient Boosting. Basically you
0499   // take a subsample of the training events and build a tree using
0500   // those. Then use the tree built from the subsample to update
0501   // the predictions for all the events.
0502
0503   subSample = std::vector<std::vector<Event*>>(events.size());
0504   size_t subSampleSize = fraction * events[0].size();
0505
0506   // Randomize the first subSampleSize events in events[0].
0507   shuffle(events[0].begin(), events[0].end(), subSampleSize);
0508
0509   // Get a copy of the random subset we just made.
0510   std::vector<Event*> v(events[0].begin(), events[0].begin() + subSampleSize);
0511
0512   // Initialize and sort the subSample collection.
0513   for (unsigned int i = 0; i < subSample.size(); i++) {
0514     subSample[i] = v;
0515   }
0516
0517   sortEventVectors(subSample);
0518 }
0519
0520 //////////////////////////////////////////////////////////////////////////
0521 // ----------------------------------------------------------------------
0522 //////////////////////////////////////////////////////////////////////////
0523
0524 void Forest::doStochasticRegression(
0525     int nodeLimit, int treeLimit, double learningRate, double fraction, LossFunction* l) {
0526   // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact
0527   // that we have to sort the events every time we extract a subsample. Without random sampling we simply
0528   // use all of the events and keep them sorted.
0529
0530   // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated
0531   // afterwards with the results from the subsample built tree.
0532
0533   // Prepare some things.
0534   sortEventVectors(events);
0535   trees.resize(treeLimit);
0536   trees.shrink_to_fit();
0537
0538   // See how long the regression takes.
0539   TStopwatch timer;
0540   timer.Start(kTRUE);
0541
0542   // Output the current settings.
0543   // std::cout << std::endl << "Running stochastic regression ... " << std::endl;
0544   //std::cout << "# Nodes: " << nodeLimit << std::endl;
0545   //std::cout << "Learning Rate: " << learningRate << std::endl;
0546   //std::cout << "Bagging Fraction: " << fraction << std::endl;
0547   //std::cout << std::endl;
0548
0549   for (unsigned int i = 0; i < (unsigned)treeLimit; i++) {
0550     // Build the tree using a random subsample.
0551     prepareRandomSubsample(fraction);
0552     trees[i] = std::make_unique<Tree>(subSample);
0553     trees[i]->buildTree(nodeLimit);
0554
0555     // Fit all of the events based upon the tree we built using
0556     // the subsample of events.
0557     trees[i]->filterEvents(events[0]);
0558
0559     // Update the targets for the next tree to fit.
0560     updateRegTargets(trees[i].get(), learningRate, l);
0561
0562     // Save trees to xml in some directory.
0563     std::ostringstream ss;
0564     ss << "trees/" << i << ".xml";
0565     std::string s = ss.str();
0566     const char* c = s.c_str();
0567
0568     trees[i]->saveToXML(c);
0569   }
0570
0571   //std::cout << std::endl << "Done." << std::endl << std::endl;
0572
0573   //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
0574 }