HcalCalibAlgos/macros/isotrackTrainRegressor.py

0001 ######################################################################################
0002 # Trains regressor and saves model for evaluation
0003 # Usage:
0004 # python3 isotrackTrainRegressor.py -I isotk_relval_hi.pkl  -V 1
0005 # python3 isotrackTrainRegressor.py -I isotk_relval_lo.pkl  -V 2
0006 ######################################################################################
0007
0008
0009 import pandas as pd
0010 import numpy as np
0011 import matplotlib.pyplot as plt
0012 import argparse
0013 import tensorflow.keras
0014 from tensorflow.keras.models import Sequential
0015 from tensorflow.keras.layers import Dense
0016 from tensorflow.keras.layers import Dropout
0017 from tensorflow.keras.utils import to_categorical
0018 from tensorflow.keras.utils import plot_model
0019 from tensorflow.keras import regularizers
0020 from sklearn.metrics import roc_curve, auc
0021 from tensorflow.keras.layers import Activation
0022 from tensorflow.keras import backend as K
0023 from tensorflow.keras.models import save_model
0024
0025
0026
0027 parser = argparse.ArgumentParser()
0028 parser.add_argument("-I", "--input",help="input file",default="isotk_relval_hi.pkl")
0029 parser.add_argument("-V", "--modelv",help="model version (any number) ",default="1")
0030
0031
0032
0033 fName = parser.parse_args().input
0034 modv = parser.parse_args().modelv
0035 # In[2]:
0036
0037
0038 df = pd.read_pickle(fName)
0039 print ("vars in file:",df.keys())
0040 print("events in df original:",df.shape[0])
0041 df = df.loc[df['t_eHcal_y'] > 20]
0042 print("events in df after energy cut:",df.shape[0])
0043 df['t_eHcal_xun'] = df['t_eHcal_x']
0044 df['t_delta_un'] = df['t_delta']
0045 df['t_ieta_un'] = df['t_ieta']
0046
0047 mina = []
0048 maxa = []
0049 keya = []
0050
0051
0052 for i in df.keys():
0053     keya.append(i)
0054     mina.append(df[i].min())
0055     maxa.append(df[i].max())
0056
0057 print('var = ',keya)
0058 print('mina = ',mina)
0059 print('maxa = ',maxa)
0060
0061
0062 #cols_to_stand = ['t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
0063 #cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt']
0064 cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt','t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
0065 #df[cols_to_stand] = df[cols_to_stand].apply(lambda x: (x - x.mean()) /(x.std()))
0066 #df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.mean()) /  (x.max() - x.min()))
0067 #                                            #(x.max() - x.min()))
0068 df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.min()) /  (x.max() - x.min()))
0069
0070 data = df.values
0071 print ('data shape:',data.shape)
0072 targets = data[:,12]
0073 targets.shape
0074
0075
0076 # In[3]:
0077
0078
0079 data = df.values
0080 ntest = 20000
0081 testindx = data.shape[0] - ntest
0082 X_train = data[:testindx,0:12]
0083 Y_train = data[:testindx,12]
0084 X_test = data[testindx:,:]
0085 print ("shape of X_train:",X_train.shape)
0086 print ("shape of Y_train:",Y_train.shape)
0087 print ("shape of X_test:",X_test.shape)
0088 meany = np.mean(Y_train)
0089 print ("mean y:",meany)
0090 stdy = np.std(Y_train)
0091 print ("std y:", stdy)
0092
0093
0094 # In[4]:
0095
0096
0097 ############################################# marinas correction  form
0098 a0 = [0.973, 0.998,  0.992,  0.965 ]
0099 a1 =  [0,    -0.318, -0.261, -0.406]
0100 a2 = [0,     0,      0.047,  0.089]
0101 def fac0(jeta):
0102     PU_IETA_1 = 9
0103     PU_IETA_2 = 16
0104     PU_IETA_3 = 25
0105     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0106     return a0[idx]
0107 def fac1(jeta):
0108     PU_IETA_1 = 9
0109     PU_IETA_2 = 16
0110     PU_IETA_3 = 25
0111     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0112     return a1[idx]
0113 def fac2(jeta):
0114     PU_IETA_1 = 9
0115     PU_IETA_2 = 16
0116     PU_IETA_3 = 25
0117     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0118     return a2[idx]
0119
0120 vec0 = np.vectorize(fac0)
0121 vec1 = np.vectorize(fac1)
0122 vec2 = np.vectorize(fac2)
0123
0124 X_test[:,17]
0125 eop = (X_test[:,15]/X_test[:,13])
0126 dop = (X_test[:,16]/X_test[:,13])
0127 #mcorrval = vec0(abs(X_test[:,17])) + vec1(abs(X_test[:,17]))*(X_test[:,15]/X_test[:,13])*(X_test[:,16]/X_test[:,13])*( 1 + vec2(fac(abs(X_test[:,17])))*(X_test[:,16]/X_test[:,13]))
0128
0129 mcorrval = vec0(abs(X_test[:,17]))   +  vec1(abs(X_test[:,17]))*eop*dop*( 1   +  vec2(abs(X_test[:,17]))*dop)
0130
0131
0132 # In[5]:
0133
0134
0135 def propweights(y_true):
0136     weight = np.copy(y_true)
0137     weight[abs(y_true - meany) > 0] = 1.90*abs(y_true - meany)/stdy  #1.25
0138 #    weight[abs(y_true - meany) > stdy] = 1.75*abs((weight[abs(y_true - meany) > stdy]) - meany)/(stdy)
0139     weight[abs(y_true - meany) < stdy] =  1
0140     return weight
0141
0142
0143 # In[6]:
0144
0145
0146 from tensorflow.keras import optimizers
0147 print ("creating model=========>")
0148 model = Sequential()
0149 model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
0150 model.add(Dropout(0.2))
0151 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0152 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0153 model.add(Dense(60, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0154 model.add(Dense(1))
0155
0156 RMS = tensorflow.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
0157 # Compile model
0158 print ("compilation up next=======>")
0159 model.compile(loss='logcosh',optimizer='adam')
0160 model.summary()
0161 #fitting
0162 print ("fitting now=========>")
0163 history = model.fit(X_train,Y_train , batch_size=5000, epochs=100, validation_split=0.2, verbose=1,sample_weight=propweights(Y_train))
0164
0165
0166 # In[7]:
0167 plt.plot(history.history['loss'])
0168 plt.plot(history.history['val_loss'])
0169 plt.title('model loss')
0170 plt.ylabel('loss')
0171 plt.xlabel('epoch')
0172 plt.legend(['train', 'validation'], loc='upper left')
0173 plt.savefig(modv+'_loss_distributions.png')
0174 plt.close()
0175
0176
0177 preds = model.predict(X_test[:,0:12])
0178 targets = X_test[:,12]
0179 uncorrected = X_test[:,15]
0180 marinascorr = X_test[:,15]*mcorrval
0181 plt.hist(preds, bins =100, range=(0,200),label='PU regression',alpha=0.6)
0182 plt.hist(targets, bins =100, range=(0,200),label='no PU',alpha=0.6)
0183 plt.hist(uncorrected, bins =100, range=(0,200),label='uncorrected',alpha=0.6)
0184 #plt.hist(marinascorr, bins =100, range=(0,200),label='marinas correction',alpha=0.6)
0185 plt.yscale('log')
0186 plt.title("Energy distribution")
0187 plt.legend(loc='upper right')
0188 plt.savefig(modv+'_energy_distributions.png')
0189 plt.close()
0190
0191 preds = preds.reshape(preds.shape[0])
0192 print (preds.shape)
0193
0194
0195
0196 #plt.hist(preds, bins =100, range=(0,100),label='PU regression',alpha=0.6)
0197 #plt.hist(abs(preds -targets)/targets , bins =100, range=(0,40),label='PU regression',alpha=0.6)
0198 #plt.hist(targets, bins =100, range=(0,100),label='no PU',alpha=0.6)
0199 plt.hist(abs(uncorrected -targets)/targets*100, bins =100, range=(0,100),label='uncorrected',alpha=0.6)
0200 #plt.hist(abs(marinascorr -targets)/targets*100, bins =100, range=(0,100),label='marinas correction',alpha=0.6)
0201 plt.hist(100*abs(preds -targets)/targets, bins =100, range=(0,100),label='PU correction',alpha=0.6)
0202 #plt.yscale('log')
0203 plt.title("error distribution")
0204 plt.legend(loc='upper right')
0205 plt.savefig(modv+'_errors.png')
0206 plt.close()
0207
0208 #plt.scatter(targets, marinascorr,alpha=0.3,label='marinascorr')
0209 plt.scatter(targets, uncorrected,alpha=0.3,label='uncorr')
0210 plt.scatter(targets, preds,alpha=0.3,label='PUcorr')
0211 plt.xlabel('True Values ')
0212 plt.ylabel('Predictions ')
0213 plt.legend(loc='upper right')
0214 lims = [0, 200]
0215 plt.xlim(lims)
0216 plt.ylim(lims)
0217 _ = plt.plot(lims, lims)
0218 plt.savefig(modv+'_energyscatt.png')
0219 plt.close()
0220
0221
0222
0223 pmom= X_test[:,13]
0224 #get_ipython().run_line_magic('matplotlib', 'inline')
0225 plt.hist(targets/pmom, bins =100, range=(0,5),label='E/p noPU',alpha=0.6)
0226 #plt.hist(preds/pmom, bins =100, range=(0,5),label='E/p PUcorr',alpha=0.6)
0227 #plt.hist(uncorrected/pmom, bins =100, range=(0,5),label='E/p uncorr',alpha=0.6)
0228 plt.hist(marinascorr/pmom, bins =100, range=(0,5),label='E/p marina corr',alpha=0.6)
0229 #plt.hist(np.exp(preds.reshape((preds.shape[0])))/pmom[0:n_test_events], bins =100, range=(0,5),label='corrEnp/p',alpha=0.3)
0230 #plt.hist(preds.reshape((preds.shape[0]))/pmom[0:n_test_events], bins =100, range=(0,5),label='corrEnp/p',alpha=0.3)
0231 plt.legend(loc='upper right')
0232 plt.yscale('log')
0233 plt.title("E/p distributions")
0234 plt.savefig(modv+'_eopdist.png')
0235 plt.close()
0236
0237 # In[8]:
0238
0239
0240 import os
0241 ############## save model
0242 if not os.path.exists('models'):
0243     os.makedirs('models')
0244 model.save('models/model'+modv+'.h5')
0245 #new_model_2 = load_model('my_model.h5')
0246