Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 11:58:56

0001 ######################################################################################
0002 # Trains regressor and saves model for evaluation
0003 # Usage:
0004 # python3 isotrackTrainRegressor.py -I isotk_relval_hi.pkl  -V 1 
0005 # python3 isotrackTrainRegressor.py -I isotk_relval_lo.pkl  -V 2
0006 ######################################################################################
0007 
0008 
0009 import pandas as pd
0010 import numpy as np
0011 import matplotlib.pyplot as plt
0012 import argparse
0013 import tensorflow.keras
0014 from tensorflow.keras.models import Sequential
0015 from tensorflow.keras.layers import Dense
0016 from tensorflow.keras.layers import Dropout
0017 from tensorflow.keras.utils import to_categorical
0018 from tensorflow.keras.utils import plot_model
0019 from tensorflow.keras import regularizers
0020 from sklearn.metrics import roc_curve, auc
0021 from tensorflow.keras.layers import Activation
0022 from tensorflow.keras import backend as K
0023 from tensorflow.keras.models import save_model
0024 
0025 
0026 
0027 parser = argparse.ArgumentParser()
0028 parser.add_argument("-I", "--input",help="input file",default="isotk_relval_hi.pkl")
0029 parser.add_argument("-V", "--modelv",help="model version (any number) ",default="1")
0030 
0031 
0032 
0033 fName = parser.parse_args().input
0034 modv = parser.parse_args().modelv
0035 # In[2]:
0036 
0037 
0038 df = pd.read_pickle(fName)
0039 print ("vars in file:",df.keys())
0040 print("events in df original:",df.shape[0])
0041 df = df.loc[df['t_eHcal_y'] > 20]
0042 print("events in df after energy cut:",df.shape[0])
0043 df['t_eHcal_xun'] = df['t_eHcal_x']
0044 df['t_delta_un'] = df['t_delta']
0045 df['t_ieta_un'] = df['t_ieta']
0046 
0047 mina = []
0048 maxa = []
0049 keya = []
0050 
0051 
0052 for i in df.keys():
0053     keya.append(i)
0054     mina.append(df[i].min())
0055     maxa.append(df[i].max())
0056 
0057 print('var = ',keya)
0058 print('mina = ',mina)
0059 print('maxa = ',maxa)
0060 
0061 
0062 #cols_to_stand = ['t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
0063 #cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt']
0064 cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt','t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
0065 #df[cols_to_stand] = df[cols_to_stand].apply(lambda x: (x - x.mean()) /(x.std()))
0066 #df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.mean()) /  (x.max() - x.min()))
0067 #                                            #(x.max() - x.min()))
0068 df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.min()) /  (x.max() - x.min()))
0069 
0070 data = df.values
0071 print ('data shape:',data.shape)
0072 targets = data[:,12]
0073 targets.shape
0074 
0075 
0076 # In[3]:
0077 
0078 
0079 data = df.values
0080 ntest = 20000
0081 testindx = data.shape[0] - ntest
0082 X_train = data[:testindx,0:12]
0083 Y_train = data[:testindx,12]
0084 X_test = data[testindx:,:]
0085 print ("shape of X_train:",X_train.shape)
0086 print ("shape of Y_train:",Y_train.shape)
0087 print ("shape of X_test:",X_test.shape)
0088 meany = np.mean(Y_train)
0089 print ("mean y:",meany)
0090 stdy = np.std(Y_train)
0091 print ("std y:", stdy)
0092 
0093 
0094 # In[4]:
0095 
0096 
0097 ############################################# marinas correction  form
0098 a0 = [0.973, 0.998,  0.992,  0.965 ]
0099 a1 =  [0,    -0.318, -0.261, -0.406]
0100 a2 = [0,     0,      0.047,  0.089]
0101 def fac0(jeta):
0102     PU_IETA_1 = 9
0103     PU_IETA_2 = 16
0104     PU_IETA_3 = 25
0105     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0106     return a0[idx]
0107 def fac1(jeta):
0108     PU_IETA_1 = 9
0109     PU_IETA_2 = 16
0110     PU_IETA_3 = 25
0111     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0112     return a1[idx]
0113 def fac2(jeta):
0114     PU_IETA_1 = 9
0115     PU_IETA_2 = 16
0116     PU_IETA_3 = 25
0117     idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0118     return a2[idx]
0119 
0120 vec0 = np.vectorize(fac0)
0121 vec1 = np.vectorize(fac1)
0122 vec2 = np.vectorize(fac2)
0123 
0124 X_test[:,17]
0125 eop = (X_test[:,15]/X_test[:,13])
0126 dop = (X_test[:,16]/X_test[:,13])
0127 #mcorrval = vec0(abs(X_test[:,17])) + vec1(abs(X_test[:,17]))*(X_test[:,15]/X_test[:,13])*(X_test[:,16]/X_test[:,13])*( 1 + vec2(fac(abs(X_test[:,17])))*(X_test[:,16]/X_test[:,13]))
0128 
0129 mcorrval = vec0(abs(X_test[:,17]))   +  vec1(abs(X_test[:,17]))*eop*dop*( 1   +  vec2(abs(X_test[:,17]))*dop)
0130 
0131 
0132 # In[5]:
0133 
0134 
0135 def propweights(y_true):
0136     weight = np.copy(y_true)
0137     weight[abs(y_true - meany) > 0] = 1.90*abs(y_true - meany)/stdy  #1.25
0138 #    weight[abs(y_true - meany) > stdy] = 1.75*abs((weight[abs(y_true - meany) > stdy]) - meany)/(stdy)
0139     weight[abs(y_true - meany) < stdy] =  1
0140     return weight
0141 
0142 
0143 # In[6]:
0144 
0145 
0146 from tensorflow.keras import optimizers
0147 print ("creating model=========>")
0148 model = Sequential()
0149 model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
0150 model.add(Dropout(0.2))
0151 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0152 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0153 model.add(Dense(60, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0154 model.add(Dense(1))
0155 
0156 RMS = tensorflow.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
0157 # Compile model
0158 print ("compilation up next=======>")
0159 model.compile(loss='logcosh',optimizer='adam')
0160 model.summary()
0161 #fitting
0162 print ("fitting now=========>")
0163 history = model.fit(X_train,Y_train , batch_size=5000, epochs=100, validation_split=0.2, verbose=1,sample_weight=propweights(Y_train))
0164 
0165 
0166 # In[7]:
0167 plt.plot(history.history['loss'])
0168 plt.plot(history.history['val_loss'])
0169 plt.title('model loss')
0170 plt.ylabel('loss')
0171 plt.xlabel('epoch')
0172 plt.legend(['train', 'validation'], loc='upper left')
0173 plt.savefig(modv+'_loss_distributions.png')
0174 plt.close()
0175 
0176 
0177 preds = model.predict(X_test[:,0:12])
0178 targets = X_test[:,12]
0179 uncorrected = X_test[:,15]
0180 marinascorr = X_test[:,15]*mcorrval
0181 plt.hist(preds, bins =100, range=(0,200),label='PU regression',alpha=0.6)
0182 plt.hist(targets, bins =100, range=(0,200),label='no PU',alpha=0.6)
0183 plt.hist(uncorrected, bins =100, range=(0,200),label='uncorrected',alpha=0.6)
0184 #plt.hist(marinascorr, bins =100, range=(0,200),label='marinas correction',alpha=0.6)
0185 plt.yscale('log')
0186 plt.title("Energy distribution")
0187 plt.legend(loc='upper right')
0188 plt.savefig(modv+'_energy_distributions.png')
0189 plt.close()
0190 
0191 preds = preds.reshape(preds.shape[0])
0192 print (preds.shape)
0193 
0194 
0195 
0196 #plt.hist(preds, bins =100, range=(0,100),label='PU regression',alpha=0.6)
0197 #plt.hist(abs(preds -targets)/targets , bins =100, range=(0,40),label='PU regression',alpha=0.6)
0198 #plt.hist(targets, bins =100, range=(0,100),label='no PU',alpha=0.6)
0199 plt.hist(abs(uncorrected -targets)/targets*100, bins =100, range=(0,100),label='uncorrected',alpha=0.6)
0200 #plt.hist(abs(marinascorr -targets)/targets*100, bins =100, range=(0,100),label='marinas correction',alpha=0.6)
0201 plt.hist(100*abs(preds -targets)/targets, bins =100, range=(0,100),label='PU correction',alpha=0.6)
0202 #plt.yscale('log')
0203 plt.title("error distribution")
0204 plt.legend(loc='upper right')
0205 plt.savefig(modv+'_errors.png')
0206 plt.close()
0207 
0208 #plt.scatter(targets, marinascorr,alpha=0.3,label='marinascorr')
0209 plt.scatter(targets, uncorrected,alpha=0.3,label='uncorr')
0210 plt.scatter(targets, preds,alpha=0.3,label='PUcorr')
0211 plt.xlabel('True Values ')
0212 plt.ylabel('Predictions ')
0213 plt.legend(loc='upper right')
0214 lims = [0, 200]
0215 plt.xlim(lims)
0216 plt.ylim(lims)
0217 _ = plt.plot(lims, lims)
0218 plt.savefig(modv+'_energyscatt.png')
0219 plt.close()
0220 
0221 
0222 
0223 pmom= X_test[:,13]
0224 #get_ipython().run_line_magic('matplotlib', 'inline')
0225 plt.hist(targets/pmom, bins =100, range=(0,5),label='E/p noPU',alpha=0.6)
0226 #plt.hist(preds/pmom, bins =100, range=(0,5),label='E/p PUcorr',alpha=0.6)
0227 #plt.hist(uncorrected/pmom, bins =100, range=(0,5),label='E/p uncorr',alpha=0.6)
0228 plt.hist(marinascorr/pmom, bins =100, range=(0,5),label='E/p marina corr',alpha=0.6)
0229 #plt.hist(np.exp(preds.reshape((preds.shape[0])))/pmom[0:n_test_events], bins =100, range=(0,5),label='corrEnp/p',alpha=0.3)
0230 #plt.hist(preds.reshape((preds.shape[0]))/pmom[0:n_test_events], bins =100, range=(0,5),label='corrEnp/p',alpha=0.3)
0231 plt.legend(loc='upper right')
0232 plt.yscale('log')
0233 plt.title("E/p distributions") 
0234 plt.savefig(modv+'_eopdist.png')
0235 plt.close()
0236 
0237 # In[8]:
0238 
0239 
0240 import os
0241 ############## save model
0242 if not os.path.exists('models'):
0243     os.makedirs('models')
0244 model.save('models/model'+modv+'.h5')
0245 #new_model_2 = load_model('my_model.h5')
0246