Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:20:55

0001 //////////////////////////////////////////////////////////////////////////
0002 //                            Forest.cxx                                //
0003 // =====================================================================//
0004 // This is the object implementation of a forest of decision trees.     //
0005 // We need this to implement gradient boosting.                         //
0006 // References include                                                   //
0007 //    *Elements of Statistical Learning by Hastie,                      //
0008 //     Tibshirani, and Friedman.                                        //
0009 //    *Greedy Function Approximation: A Gradient Boosting Machine.      //
0010 //     Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001.    //
0011 //    *Inductive Learning of Tree-based Regression Models. Luis Torgo.  //
0012 //                                                                      //
0013 //////////////////////////////////////////////////////////////////////////
0014 
0015 ///////////////////////////////////////////////////////////////////////////
0016 // _______________________Includes_______________________________________//
0017 ///////////////////////////////////////////////////////////////////////////
0018 
0019 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Forest.h"
0020 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Utilities.h"
0021 
0022 #include "FWCore/ParameterSet/interface/FileInPath.h"
0023 
0024 #include "TStopwatch.h"
0025 #include "TString.h"
0026 
0027 #include <iostream>
0028 #include <sstream>
0029 #include <algorithm>
0030 #include <iterator>
0031 #include <fstream>
0032 #include <utility>
0033 
0034 using namespace emtf;
0035 
0036 //////////////////////////////////////////////////////////////////////////
0037 // _______________________Constructor(s)________________________________//
0038 //////////////////////////////////////////////////////////////////////////
0039 
0040 Forest::Forest() { events = std::vector<std::vector<Event*>>(1); }
0041 
0042 //////////////////////////////////////////////////////////////////////////
0043 // ----------------------------------------------------------------------
0044 //////////////////////////////////////////////////////////////////////////
0045 
0046 Forest::Forest(std::vector<Event*>& trainingEvents) { setTrainingEvents(trainingEvents); }
0047 
0048 /////////////////////////////////////////////////////////////////////////
0049 // _______________________Destructor____________________________________//
0050 //////////////////////////////////////////////////////////////////////////
0051 
0052 Forest::~Forest() {
0053   // When the forest is destroyed it will delete the trees as well as the
0054   // events from the training and testing sets.
0055   // The user may want the events to remain after they destroy the forest
0056   // this should be changed in future upgrades.
0057 
0058   for (unsigned int i = 0; i < trees.size(); i++) {
0059     if (trees[i])
0060       delete trees[i];
0061   }
0062 }
0063 
0064 Forest::Forest(const Forest& forest) {
0065   transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const Tree* tree) {
0066     return new Tree(*tree);
0067   });
0068 }
0069 
0070 Forest& Forest::operator=(const Forest& forest) {
0071   for (unsigned int i = 0; i < trees.size(); i++) {
0072     if (trees[i])
0073       delete trees[i];
0074   }
0075   trees.resize(0);
0076 
0077   transform(forest.trees.cbegin(), forest.trees.cend(), back_inserter(trees), [](const Tree* tree) {
0078     return new Tree(*tree);
0079   });
0080   return *this;
0081 }
0082 
0083 //////////////////////////////////////////////////////////////////////////
0084 // ______________________Get/Set_Functions______________________________//
0085 //////////////////////////////////////////////////////////////////////////
0086 
0087 void Forest::setTrainingEvents(std::vector<Event*>& trainingEvents) {
0088   // tell the forest which events to use for training
0089 
0090   Event* e = trainingEvents[0];
0091   // Unused variable
0092   // unsigned int numrows = e->data.size();
0093 
0094   // Reset the events matrix.
0095   events = std::vector<std::vector<Event*>>();
0096 
0097   for (unsigned int i = 0; i < e->data.size(); i++) {
0098     events.push_back(trainingEvents);
0099   }
0100 }
0101 
0102 //////////////////////////////////////////////////////////////////////////
0103 // ----------------------------------------------------------------------
0104 //////////////////////////////////////////////////////////////////////////
0105 
0106 // return a copy of the training events
0107 std::vector<Event*> Forest::getTrainingEvents() { return events[0]; }
0108 
0109 //////////////////////////////////////////////////////////////////////////
0110 // ----------------------------------------------------------------------
0111 //////////////////////////////////////////////////////////////////////////
0112 
0113 // return the ith tree
0114 Tree* Forest::getTree(unsigned int i) {
0115   if (/*i>=0 && */ i < trees.size())
0116     return trees[i];
0117   else {
0118     //std::cout << i << "is an invalid input for getTree. Out of range." << std::endl;
0119     return nullptr;
0120   }
0121 }
0122 
0123 //////////////////////////////////////////////////////////////////////////
0124 // ______________________Various_Helpful_Functions______________________//
0125 //////////////////////////////////////////////////////////////////////////
0126 
// Number of trees currently stored in the forest.
unsigned int Forest::size() {
  // Return the number of trees in the forest.
  return trees.size();
}
0131 
0132 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
0133 //*** Need to make a data structure that includes the next few functions ***
0134 //*** pertaining to events. These don't really have much to do with the  ***
0135 //*** forest class.                                                      ***
0136 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
0137 
0138 //////////////////////////////////////////////////////////////////////////
0139 // ----------------------------------------------------------------------
0140 //////////////////////////////////////////////////////////////////////////
0141 
0142 void Forest::listEvents(std::vector<std::vector<Event*>>& e) {
0143   // Simply list the events in each event vector. We have multiple copies
0144   // of the events vector. Each copy is sorted according to a different
0145   // determining variable.
0146   std::cout << std::endl << "Listing Events... " << std::endl;
0147 
0148   for (unsigned int i = 0; i < e.size(); i++) {
0149     std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl;
0150     for (unsigned int j = 0; j < e[i].size(); j++) {
0151       e[i][j]->outputEvent();
0152     }
0153     std::cout << std::endl;
0154   }
0155 }
0156 
0157 //////////////////////////////////////////////////////////////////////////
0158 // ----------------------------------------------------------------------
0159 //////////////////////////////////////////////////////////////////////////
0160 
// We have to initialize Event::sortingIndex outside of a function since
// it is a static member. It selects which feature column compareEvents
// uses as the sort key.
int Event::sortingIndex = 1;

// Comparator used by std::sort in Forest::sortEventVectors.
// NOTE(review): the sort key is a mutable global, so concurrent sorts on
// different variables would race — confirm single-threaded use.
bool compareEvents(Event* e1, Event* e2) {
  // Sort the events according to the variable given by the sortingIndex.
  return e1->data[Event::sortingIndex] < e2->data[Event::sortingIndex];
}
//////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------
//////////////////////////////////////////////////////////////////////////

// Comparator ordering events by their unique id.
bool compareEventsById(Event* e1, Event* e2) {
  // Sort the events by ID. We need this to produce rate plots.
  return e1->id < e2->id;
}
0177 //////////////////////////////////////////////////////////////////////////
0178 // ----------------------------------------------------------------------
0179 //////////////////////////////////////////////////////////////////////////
0180 
0181 void Forest::sortEventVectors(std::vector<std::vector<Event*>>& e) {
0182   // When a node chooses the optimum split point and split variable it needs
0183   // the events to be sorted according to the variable it is considering.
0184 
0185   for (unsigned int i = 0; i < e.size(); i++) {
0186     Event::sortingIndex = i;
0187     std::sort(e[i].begin(), e[i].end(), compareEvents);
0188   }
0189 }
0190 
0191 //////////////////////////////////////////////////////////////////////////
0192 // ----------------------------------------------------------------------
0193 //////////////////////////////////////////////////////////////////////////
0194 
0195 void Forest::rankVariables(std::vector<int>& rank) {
0196   // This function ranks the determining variables according to their importance
0197   // in determining the fit. Use a low learning rate for better results.
0198   // Separates completely useless variables from useful ones well,
0199   // but isn't the best at separating variables of similar importance.
0200   // This is calculated using the error reduction on the training set. The function
0201   // should be changed to use the testing set, but this works fine for now.
0202   // I will try to change this in the future.
0203 
0204   // Initialize the vector v, which will store the total error reduction
0205   // for each variable i in v[i].
0206   std::vector<double> v(events.size(), 0);
0207 
0208   //std::cout << std::endl << "Ranking Variables by Net Error Reduction... " << std::endl;
0209 
0210   for (unsigned int j = 0; j < trees.size(); j++) {
0211     trees[j]->rankVariables(v);
0212   }
0213 
0214   double max = *std::max_element(v.begin(), v.end());
0215 
0216   // Scale the importance. Maximum importance = 100.
0217   for (unsigned int i = 0; i < v.size(); i++) {
0218     v[i] = 100 * v[i] / max;
0219   }
0220 
0221   // Change the storage format so that we can keep the index
0222   // and the value associated after sorting.
0223   std::vector<std::pair<double, int>> w(events.size());
0224 
0225   for (unsigned int i = 0; i < v.size(); i++) {
0226     w[i] = std::pair<double, int>(v[i], i);
0227   }
0228 
0229   // Sort so that we can output in order of importance.
0230   std::sort(w.begin(), w.end());
0231 
0232   // Output the results.
0233   for (int i = (v.size() - 1); i >= 0; i--) {
0234     rank.push_back(w[i].second);
0235     // std::cout << "x" << w[i].second  << ": " << w[i].first  << std::endl;
0236   }
0237 
0238   // std::cout << std::endl << "Done." << std::endl << std::endl;
0239 }
0240 
0241 //////////////////////////////////////////////////////////////////////////
0242 // ----------------------------------------------------------------------
0243 //////////////////////////////////////////////////////////////////////////
0244 
void Forest::saveSplitValues(const char* savefilename) {
  // This function gathers all of the split values from the forest and puts them into lists.
  // One line is written per feature variable: the sorted, de-duplicated split
  // values used anywhere in the forest, separated by commas.

  std::ofstream splitvaluefile;
  // NOTE(review): no is_open() check; a bad path fails silently — confirm.
  splitvaluefile.open(savefilename);

  // Initialize the matrix v, which will store the list of split values
  // for each variable i in v[i].
  std::vector<std::vector<double>> v(events.size(), std::vector<double>());

  //std::cout << std::endl << "Gathering split values... " << std::endl;

  // Gather the split values from each tree in the forest.
  for (unsigned int j = 0; j < trees.size(); j++) {
    trees[j]->getSplitValues(v);
  }

  // Sort the lists of split values and remove the duplicates.
  for (unsigned int i = 0; i < v.size(); i++) {
    std::sort(v[i].begin(), v[i].end());
    v[i].erase(unique(v[i].begin(), v[i].end()), v[i].end());
  }

  // Output the results after removing duplicates.
  // The 0th variable is special and is not used for splitting, so we start at 1.
  for (unsigned int i = 1; i < v.size(); i++) {
    TString splitValues;
    for (unsigned int j = 0; j < v[i].size(); j++) {
      // Scientific notation with 14-digit precision for a lossless round-trip.
      std::stringstream ss;
      ss.precision(14);
      ss << std::scientific << v[i][j];
      splitValues += ",";
      splitValues += ss.str().c_str();
    }

    // Drop the leading comma.
    // NOTE(review): if a variable has no split values this slices an empty
    // TString with (1, 0) — presumably yields an empty string; verify.
    splitValues = splitValues(1, splitValues.Length());
    splitvaluefile << splitValues << std::endl << std::endl;
    ;  // (stray semicolon kept from the original)
  }
}
0285 //////////////////////////////////////////////////////////////////////////
0286 // ______________________Update_Events_After_Fitting____________________//
0287 //////////////////////////////////////////////////////////////////////////
0288 
0289 void Forest::updateRegTargets(Tree* tree, double learningRate, LossFunction* l) {
0290   // Prepare the global vector of events for the next tree.
0291   // Update the fit for each event and set the new target value
0292   // for the next tree.
0293 
0294   // Get the list of terminal nodes for this tree.
0295   std::list<Node*>& tn = tree->getTerminalNodes();
0296 
0297   // Loop through the terminal nodes.
0298   for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) {
0299     // Get the events in the current terminal region.
0300     std::vector<Event*>& v = (*it)->getEvents()[0];
0301 
0302     // Fit the events depending on the loss function criteria.
0303     double fit = l->fit(v);
0304 
0305     // Scale the rate at which the algorithm converges.
0306     fit = learningRate * fit;
0307 
0308     // Store the official fit value in the terminal node.
0309     (*it)->setFitValue(fit);
0310 
0311     // Loop through each event in the terminal region and update the
0312     // the target for the next tree.
0313     for (unsigned int j = 0; j < v.size(); j++) {
0314       Event* e = v[j];
0315       e->predictedValue += fit;
0316       e->data[0] = l->target(e);
0317     }
0318 
0319     // Release memory.
0320     (*it)->getEvents() = std::vector<std::vector<Event*>>();
0321   }
0322 }
0323 
0324 /////////////////////////////////////////////////////////////////////////
0325 // ----------------------------------------------------------------------
0326 /////////////////////////////////////////////////////////////////////////
0327 
0328 void Forest::updateEvents(Tree* tree) {
0329   // Prepare the test events for the next tree.
0330 
0331   // Get the list of terminal nodes for this tree.
0332   std::list<Node*>& tn = tree->getTerminalNodes();
0333 
0334   // Loop through the terminal nodes.
0335   for (std::list<Node*>::iterator it = tn.begin(); it != tn.end(); it++) {
0336     std::vector<Event*>& v = (*it)->getEvents()[0];
0337     double fit = (*it)->getFitValue();
0338 
0339     // Loop through each event in the terminal region and update the
0340     // the global event it maps to.
0341     for (unsigned int j = 0; j < v.size(); j++) {
0342       Event* e = v[j];
0343       e->predictedValue += fit;
0344     }
0345 
0346     // Release memory.
0347     (*it)->getEvents() = std::vector<std::vector<Event*>>();
0348   }
0349 }
0350 
0351 //////////////////////////////////////////////////////////////////////////
0352 // ____________________Do/Test_the Regression___________________________//
0353 //////////////////////////////////////////////////////////////////////////
0354 
0355 void Forest::doRegression(int nodeLimit,
0356                           int treeLimit,
0357                           double learningRate,
0358                           LossFunction* l,
0359                           const char* savetreesdirectory,
0360                           bool saveTrees) {
0361   // Build the forest using the training sample.
0362 
0363   //std::cout << std::endl << "--Building Forest..." << std::endl << std::endl;
0364 
0365   // The trees work with a matrix of events where the rows have the same set of events. Each row however
0366   // is sorted according to the feature variable given by event->data[row].
0367   // If we only had one set of events we would have to sort it according to the
0368   // feature variable every time we want to calculate the best split point for that feature.
0369   // By keeping sorted copies we avoid the sorting operation during splint point calculation
0370   // and save computation time. If we do not sort each of the rows the regression will fail.
0371   //std::cout << "Sorting event vectors..." << std::endl;
0372   sortEventVectors(events);
0373 
0374   // See how long the regression takes.
0375   TStopwatch timer;
0376   timer.Start(kTRUE);
0377 
0378   for (unsigned int i = 0; i < (unsigned)treeLimit; i++) {
0379     // std::cout << "++Building Tree " << i << "... " << std::endl;
0380     Tree* tree = new Tree(events);
0381     trees.push_back(tree);
0382     tree->buildTree(nodeLimit);
0383 
0384     // Update the targets for the next tree to fit.
0385     updateRegTargets(tree, learningRate, l);
0386 
0387     // Save trees to xml in some directory.
0388     std::ostringstream ss;
0389     ss << savetreesdirectory << "/" << i << ".xml";
0390     std::string s = ss.str();
0391     const char* c = s.c_str();
0392 
0393     if (saveTrees)
0394       tree->saveToXML(c);
0395   }
0396   //std::cout << std::endl;
0397   //std::cout << std::endl << "Done." << std::endl << std::endl;
0398 
0399   //    std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
0400 }
0401 
0402 //////////////////////////////////////////////////////////////////////////
0403 // ----------------------------------------------------------------------
0404 //////////////////////////////////////////////////////////////////////////
0405 
0406 void Forest::predictEvents(std::vector<Event*>& eventsp, unsigned int numtrees) {
0407   // Predict values for eventsp by running them through the forest up to numtrees.
0408 
0409   //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
0410   if (numtrees > trees.size()) {
0411     //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
0412     numtrees = trees.size();
0413   }
0414 
0415   // i iterates through the trees in the forest. Each tree corrects the last prediction.
0416   for (unsigned int i = 0; i < numtrees; i++) {
0417     //std::cout << "++Tree " << i << "..." << std::endl;
0418     appendCorrection(eventsp, i);
0419   }
0420 }
0421 
0422 //////////////////////////////////////////////////////////////////////////
0423 // ----------------------------------------------------------------------
0424 //////////////////////////////////////////////////////////////////////////
0425 
0426 void Forest::appendCorrection(std::vector<Event*>& eventsp, int treenum) {
0427   // Update the prediction by appending the next correction.
0428 
0429   Tree* tree = trees[treenum];
0430   tree->filterEvents(eventsp);
0431 
0432   // Update the events with their new prediction.
0433   updateEvents(tree);
0434 }
0435 
0436 //////////////////////////////////////////////////////////////////////////
0437 // ----------------------------------------------------------------------
0438 //////////////////////////////////////////////////////////////////////////
0439 
0440 void Forest::predictEvent(Event* e, unsigned int numtrees) {
0441   // Predict values for eventsp by running them through the forest up to numtrees.
0442 
0443   //std::cout << "Using " << numtrees << " trees from the forest to predict events ... " << std::endl;
0444   if (numtrees > trees.size()) {
0445     //std::cout << std::endl << "!! Input greater than the forest size. Using forest.size() = " << trees.size() << " to predict instead." << std::endl;
0446     numtrees = trees.size();
0447   }
0448 
0449   // just like in line #2470 of https://root.cern.ch/doc/master/MethodBDT_8cxx_source.html for gradient boosting
0450   e->predictedValue = trees[0]->getBoostWeight();
0451 
0452   // i iterates through the trees in the forest. Each tree corrects the last prediction.
0453   for (unsigned int i = 0; i < numtrees; i++) {
0454     //std::cout << "++Tree " << i << "..." << std::endl;
0455     appendCorrection(e, i);
0456   }
0457 }
0458 
0459 //////////////////////////////////////////////////////////////////////////
0460 // ----------------------------------------------------------------------
0461 //////////////////////////////////////////////////////////////////////////
0462 
0463 void Forest::appendCorrection(Event* e, int treenum) {
0464   // Update the prediction by appending the next correction.
0465 
0466   Tree* tree = trees[treenum];
0467   Node* terminalNode = tree->filterEvent(e);
0468 
0469   // Update the event with its new prediction.
0470   double fit = terminalNode->getFitValue();
0471   e->predictedValue += fit;
0472 }
0473 /////////////////////////////////////////////////////////////////////////////////////
0474 // ----------------------------------------------------------------------------------
0475 /////////////////////////////////////////////////////////////////////////////////////
0476 
0477 void Forest::loadForestFromXML(const char* directory, unsigned int numTrees) {
0478   // Load a forest that has already been created and stored into XML somewhere.
0479 
0480   // Initialize the vector of trees.
0481   trees = std::vector<Tree*>(numTrees);
0482 
0483   // Load the Forest.
0484   // std::cout << std::endl << "Loading Forest from XML ... " << std::endl;
0485   for (unsigned int i = 0; i < numTrees; i++) {
0486     trees[i] = new Tree();
0487 
0488     std::stringstream ss;
0489     ss << directory << "/" << i << ".xml";
0490 
0491     trees[i]->loadFromXML(edm::FileInPath(ss.str().c_str()).fullPath().c_str());
0492   }
0493 
0494   //std::cout << "Done." << std::endl << std::endl;
0495 }
0496 
0497 void Forest::loadFromCondPayload(const L1TMuonEndCapForest::DForest& forest) {
0498   // Load a forest that has already been created and stored in CondDB.
0499   // Initialize the vector of trees.
0500   unsigned int numTrees = forest.size();
0501 
0502   // clean-up leftovers from previous initialization (if any)
0503   for (unsigned int i = 0; i < trees.size(); i++) {
0504     if (trees[i])
0505       delete trees[i];
0506   }
0507 
0508   trees = std::vector<Tree*>(numTrees);
0509 
0510   // Load the Forest.
0511   for (unsigned int i = 0; i < numTrees; i++) {
0512     trees[i] = new Tree();
0513     trees[i]->loadFromCondPayload(forest[i]);
0514   }
0515 }
0516 
0517 //////////////////////////////////////////////////////////////////////////
0518 // ___________________Stochastic_Sampling_&_Regression__________________//
0519 //////////////////////////////////////////////////////////////////////////
0520 
0521 void Forest::prepareRandomSubsample(double fraction) {
0522   // We use this for Stochastic Gradient Boosting. Basically you
0523   // take a subsample of the training events and build a tree using
0524   // those. Then use the tree built from the subsample to update
0525   // the predictions for all the events.
0526 
0527   subSample = std::vector<std::vector<Event*>>(events.size());
0528   size_t subSampleSize = fraction * events[0].size();
0529 
0530   // Randomize the first subSampleSize events in events[0].
0531   shuffle(events[0].begin(), events[0].end(), subSampleSize);
0532 
0533   // Get a copy of the random subset we just made.
0534   std::vector<Event*> v(events[0].begin(), events[0].begin() + subSampleSize);
0535 
0536   // Initialize and sort the subSample collection.
0537   for (unsigned int i = 0; i < subSample.size(); i++) {
0538     subSample[i] = v;
0539   }
0540 
0541   sortEventVectors(subSample);
0542 }
0543 
0544 //////////////////////////////////////////////////////////////////////////
0545 // ----------------------------------------------------------------------
0546 //////////////////////////////////////////////////////////////////////////
0547 
// Build the forest with Stochastic Gradient Boosting: each tree is trained
// on a random fraction of the events, then ALL events are updated with that
// tree's corrections.
void Forest::doStochasticRegression(
    int nodeLimit, int treeLimit, double learningRate, double fraction, LossFunction* l) {
  // If the fraction of events to use is one then this algorithm is slower than doRegression due to the fact
  // that we have to sort the events every time we extract a subsample. Without random sampling we simply
  // use all of the events and keep them sorted.

  // Anyways, this algorithm uses a portion of the events to train each tree. All of the events are updated
  // afterwards with the results from the subsample built tree.

  // Prepare some things.
  sortEventVectors(events);
  // NOTE(review): this overwrites any previously stored trees without
  // deleting them (unlike loadFromCondPayload's clean-up) — possible leak; confirm.
  trees = std::vector<Tree*>(treeLimit);

  // See how long the regression takes.
  TStopwatch timer;
  timer.Start(kTRUE);

  // Output the current settings.
  // std::cout << std::endl << "Running stochastic regression ... " << std::endl;
  //std::cout << "# Nodes: " << nodeLimit << std::endl;
  //std::cout << "Learning Rate: " << learningRate << std::endl;
  //std::cout << "Bagging Fraction: " << fraction << std::endl;
  //std::cout << std::endl;

  for (unsigned int i = 0; i < (unsigned)treeLimit; i++) {
    // Build the tree using a random subsample.
    prepareRandomSubsample(fraction);
    trees[i] = new Tree(subSample);
    trees[i]->buildTree(nodeLimit);

    // Fit all of the events based upon the tree we built using
    // the subsample of events.
    trees[i]->filterEvents(events[0]);

    // Update the targets for the next tree to fit.
    updateRegTargets(trees[i], learningRate, l);

    // Save trees to xml in some directory.
    // NOTE(review): unlike doRegression there is no saveTrees flag here; the
    // trees are always written to the hard-coded "trees/" directory — confirm
    // that directory exists at run time.
    std::ostringstream ss;
    ss << "trees/" << i << ".xml";
    std::string s = ss.str();
    const char* c = s.c_str();

    trees[i]->saveToXML(c);
  }

  //std::cout << std::endl << "Done." << std::endl << std::endl;

  //std::cout << std::endl << "Total calculation time: " << timer.RealTime() << std::endl;
}