src/bdt/Node.cc

0001 //////////////////////////////////////////////////////////////////////////
0002 //                            Node.cxx                                  //
0003 // =====================================================================//
0004 // This is the object implementation of a node, which is the            //
0005 // fundamental unit of a decision tree.                                 //
0006 // References include                                                   //
0007 //    *Elements of Statistical Learning by Hastie,                      //
0008 //     Tibshirani, and Friedman.                                        //
0009 //    *Greedy Function Approximation: A Gradient Boosting Machine.      //
0010 //     Friedman. The Annals of Statistics, Vol. 29, No. 5. Oct 2001.    //
0011 //    *Inductive Learning of Tree-based Regression Models. Luis Torgo.  //
0012 //                                                                      //
0013 //////////////////////////////////////////////////////////////////////////
0014
0015 ///////////////////////////////////////////////////////////////////////////
0016 // _______________________Includes_______________________________________//
0017 ///////////////////////////////////////////////////////////////////////////
0018
0019 #include "L1Trigger/L1TMuonEndCap/interface/bdt/Node.h"
0020
0021 #include "TRandom3.h"
0022 #include "TStopwatch.h"
0023 #include <iostream>
0024 #include <fstream>
0025
0026 //////////////////////////////////////////////////////////////////////////
0027 // _______________________Constructor(s)________________________________//
0028 //////////////////////////////////////////////////////////////////////////
0029
0030 using namespace emtf;
0031
0032 Node::Node() {
0033   name = "";
0034   leftDaughter = nullptr;
0035   rightDaughter = nullptr;
0036   parent = nullptr;
0037   splitValue = -99;
0038   splitVariable = -1;
0039   avgError = -1;
0040   totalError = -1;
0041   errorReduction = -1;
0042 }
0043
0044 Node::Node(std::string cName) {
0045   name = cName;
0046   leftDaughter = nullptr;
0047   rightDaughter = nullptr;
0048   parent = nullptr;
0049   splitValue = -99;
0050   splitVariable = -1;
0051   avgError = -1;
0052   totalError = -1;
0053   errorReduction = -1;
0054 }
0055
0056 //////////////////////////////////////////////////////////////////////////
0057 // ______________________Get/Set________________________________________//
0058 //////////////////////////////////////////////////////////////////////////
0059
0060 void Node::setName(std::string sName) { name = sName; }
0061
0062 std::string Node::getName() const { return name; }
0063
0064 // ----------------------------------------------------------------------
0065
0066 void Node::setErrorReduction(double sErrorReduction) { errorReduction = sErrorReduction; }
0067
0068 double Node::getErrorReduction() const { return errorReduction; }
0069
0070 // ----------------------------------------------------------------------
0071
0072 void Node::setLeftDaughter(std::unique_ptr<Node> sLeftDaughter) { leftDaughter = std::move(sLeftDaughter); }
0073
0074 Node* Node::getLeftDaughter() { return leftDaughter.get(); }
0075 const Node* Node::getLeftDaughter() const { return leftDaughter.get(); }
0076
0077 void Node::setRightDaughter(std::unique_ptr<Node> sRightDaughter) { rightDaughter = std::move(sRightDaughter); }
0078
0079 Node* Node::getRightDaughter() { return rightDaughter.get(); }
0080 const Node* Node::getRightDaughter() const { return rightDaughter.get(); }
0081
0082 // ----------------------------------------------------------------------
0083
0084 void Node::setParent(Node* sParent) { parent = sParent; }
0085
0086 Node* Node::getParent() { return parent; }
0087 const Node* Node::getParent() const { return parent; }
0088
0089 // ----------------------------------------------------------------------
0090
0091 void Node::setSplitValue(double sSplitValue) { splitValue = sSplitValue; }
0092
0093 double Node::getSplitValue() const { return splitValue; }
0094
0095 void Node::setSplitVariable(int sSplitVar) { splitVariable = sSplitVar; }
0096
0097 int Node::getSplitVariable() const { return splitVariable; }
0098
0099 // ----------------------------------------------------------------------
0100
0101 void Node::setFitValue(double sFitValue) { fitValue = sFitValue; }
0102
0103 double Node::getFitValue() const { return fitValue; }
0104
0105 // ----------------------------------------------------------------------
0106
0107 void Node::setTotalError(double sTotalError) { totalError = sTotalError; }
0108
0109 double Node::getTotalError() const { return totalError; }
0110
0111 void Node::setAvgError(double sAvgError) { avgError = sAvgError; }
0112
0113 double Node::getAvgError() const { return avgError; }
0114
0115 // ----------------------------------------------------------------------
0116
0117 void Node::setNumEvents(int sNumEvents) { numEvents = sNumEvents; }
0118
0119 int Node::getNumEvents() const { return numEvents; }
0120
0121 // ----------------------------------------------------------------------
0122
0123 std::vector<std::vector<Event*> >& Node::getEvents() { return events; }
0124
0125 void Node::setEvents(std::vector<std::vector<Event*> >& sEvents) {
0126   events = sEvents;
0127   numEvents = events[0].size();
0128 }
0129
0130 ///////////////////////////////////////////////////////////////////////////
0131 // ______________________Performace_Functions___________________________//
0132 //////////////////////////////////////////////////////////////////////////
0133
0134 void Node::calcOptimumSplit() {
0135   // Determines the split variable and split point which would most reduce the error for the given node (region).
0136   // In the process we calculate the fitValue and Error. The general aglorithm is based upon  Luis Torgo's thesis.
0137   // Check out the reference for a more in depth outline. This part is chapter 3.
0138
0139   // Intialize some variables.
0140   double bestSplitValue = 0;
0141   int bestSplitVariable = -1;
0142   double bestErrorReduction = -1;
0143
0144   double SUM = 0;
0145   double SSUM = 0;
0146   numEvents = events[0].size();
0147
0148   double candidateErrorReduction = -1;
0149
0150   // Calculate the sum of the target variables and the sum of
0151   // the target variables squared. We use these later.
0152   for (unsigned int i = 0; i < events[0].size(); i++) {
0153     double target = events[0][i]->data[0];
0154     SUM += target;
0155     SSUM += target * target;
0156   }
0157
0158   unsigned int numVars = events.size();
0159
0160   // Calculate the best split point for each variable
0161   for (unsigned int variableToCheck = 1; variableToCheck < numVars; variableToCheck++) {
0162     // The sum of the target variables in the left, right nodes
0163     double SUMleft = 0;
0164     double SUMright = SUM;
0165
0166     // The number of events in the left, right nodes
0167     int nleft = 1;
0168     int nright = events[variableToCheck].size() - 1;
0169
0170     int candidateSplitVariable = variableToCheck;
0171
0172     std::vector<Event*>& v = events[variableToCheck];
0173
0174     // Find the best split point for this variable
0175     for (unsigned int i = 1; i < v.size(); i++) {
0176       // As the candidate split point interates, the number of events in the
0177       // left/right node increases/decreases and SUMleft/right increases/decreases.
0178
0179       SUMleft = SUMleft + v[i - 1]->data[0];
0180       SUMright = SUMright - v[i - 1]->data[0];
0181
0182       // No need to check the split point if x on both sides is equal
0183       if (v[i - 1]->data[candidateSplitVariable] < v[i]->data[candidateSplitVariable]) {
0184         // Finding the maximum error reduction for Least Squares boils down to maximizing
0185         // the following statement.
0186         candidateErrorReduction = SUMleft * SUMleft / nleft + SUMright * SUMright / nright - SUM * SUM / numEvents;
0187         //                std::cout << "candidateErrorReduction= " << candidateErrorReduction << std::endl << std::endl;
0188
0189         // if the new candidate is better than the current best, then we have a new overall best.
0190         if (candidateErrorReduction > bestErrorReduction) {
0191           bestErrorReduction = candidateErrorReduction;
0192           bestSplitValue = (v[i - 1]->data[candidateSplitVariable] + v[i]->data[candidateSplitVariable]) / 2;
0193           bestSplitVariable = candidateSplitVariable;
0194         }
0195       }
0196
0197       nright = nright - 1;
0198       nleft = nleft + 1;
0199     }
0200   }
0201
0202   // Store the information gained from our computations.
0203
0204   // The fit value is the average for least squares.
0205   fitValue = SUM / numEvents;
0206   //    std::cout << "fitValue= " << fitValue << std::endl;
0207
0208   // n*[ <y^2>-k^2 ]
0209   totalError = SSUM - SUM * SUM / numEvents;
0210   //    std::cout << "totalError= " << totalError << std::endl;
0211
0212   // [ <y^2>-k^2 ]
0213   avgError = totalError / numEvents;
0214   //    std::cout << "avgError= " << avgError << std::endl;
0215
0216   errorReduction = bestErrorReduction;
0217   //    std::cout << "errorReduction= " << errorReduction << std::endl;
0218
0219   splitVariable = bestSplitVariable;
0220   //    std::cout << "splitVariable= " << splitVariable << std::endl;
0221
0222   splitValue = bestSplitValue;
0223   //    std::cout << "splitValue= " << splitValue << std::endl;
0224
0225   //if(bestSplitVariable == -1) std::cout << "splitVar = -1. numEvents = " << numEvents << ". errRed = " << errorReduction << std::endl;
0226 }
0227
0228 // ----------------------------------------------------------------------
0229
0230 void Node::listEvents() {
0231   std::cout << std::endl << "Listing Events... " << std::endl;
0232
0233   for (unsigned int i = 0; i < events.size(); i++) {
0234     std::cout << std::endl << "Variable " << i << " vector contents: " << std::endl;
0235     for (unsigned int j = 0; j < events[i].size(); j++) {
0236       events[i][j]->outputEvent();
0237     }
0238     std::cout << std::endl;
0239   }
0240 }
0241
0242 // ----------------------------------------------------------------------
0243
0244 void Node::theMiracleOfChildBirth() {
0245   // Create Daughter Nodes
0246   leftDaughter = std::make_unique<Node>(name + " left");
0247   rightDaughter = std::make_unique<Node>(name + " right");
0248
0249   // Link the Nodes Appropriately
0250   leftDaughter->setParent(this);
0251   rightDaughter->setParent(this);
0252 }
0253
0254 // ----------------------------------------------------------------------
0255
0256 void Node::filterEventsToDaughters() {
0257   // Keeping sorted copies of the event vectors allows us to save on
0258   // computation time. That way we don't have to resort the events
0259   // each time we calculate the splitpoint for a node. We sort them once.
0260   // Every time we split a node, we simply filter them down correctly
0261   // preserving the order. This way we have O(n) efficiency instead
0262   // of O(nlogn) efficiency.
0263
0264   // Anyways, this function takes events from the parent node
0265   // and filters an event into the left or right daughter
0266   // node depending on whether it is < or > the split point
0267   // for the given split variable.
0268
0269   unsigned int sv = splitVariable;
0270   double sp = splitValue;
0271
0272   Node* left = leftDaughter.get();
0273   Node* right = rightDaughter.get();
0274
0275   std::vector<std::vector<Event*> > l(events.size());
0276   std::vector<std::vector<Event*> > r(events.size());
0277
0278   for (unsigned int i = 0; i < events.size(); i++) {
0279     for (unsigned int j = 0; j < events[i].size(); j++) {
0280       Event* e = events[i][j];
0281       // Prevent out-of-bounds access
0282       if (sv >= e->data.size())
0283         continue;
0284       if (e->data[sv] < sp)
0285         l[i].push_back(e);
0286       if (e->data[sv] > sp)
0287         r[i].push_back(e);
0288     }
0289   }
0290
0291   events = std::vector<std::vector<Event*> >();
0292
0293   left->getEvents().swap(l);
0294   right->getEvents().swap(r);
0295
0296   // Set the number of events in the node.
0297   left->setNumEvents(left->getEvents()[0].size());
0298   right->setNumEvents(right->getEvents()[0].size());
0299 }
0300
0301 // ----------------------------------------------------------------------
0302
0303 Node* Node::filterEventToDaughter(Event* e) {
0304   // Anyways, this function takes an event from the parent node
0305   // and filters an event into the left or right daughter
0306   // node depending on whether it is < or > the split point
0307   // for the given split variable.
0308
0309   unsigned int sv = splitVariable;
0310   double sp = splitValue;
0311
0312   Node* left = leftDaughter.get();
0313   Node* right = rightDaughter.get();
0314   Node* nextNode = nullptr;
0315
0316   // Prevent out-of-bounds access
0317   if (left == nullptr || right == nullptr || sv >= e->data.size())
0318     return nullptr;
0319
0320   if (e->data[sv] < sp)
0321     nextNode = left;
0322   if (e->data[sv] >= sp)
0323     nextNode = right;
0324
0325   return nextNode;
0326 }