File indexing completed on 2024-04-06 11:58:56
0001
0002
0003
0004
0005
0006
0007
0008
0009 import pandas as pd
0010 import numpy as np
0011 import matplotlib.pyplot as plt
0012 import argparse
0013 import tensorflow.keras
0014 from tensorflow.keras.models import Sequential
0015 from tensorflow.keras.layers import Dense
0016 from tensorflow.keras.layers import Dropout
0017 from tensorflow.keras.utils import to_categorical
0018 from tensorflow.keras.utils import plot_model
0019 from tensorflow.keras import regularizers
0020 from sklearn.metrics import roc_curve, auc
0021 from tensorflow.keras.layers import Activation
0022 from tensorflow.keras import backend as K
0023 from tensorflow.keras.models import save_model
0024
0025
0026
0027 parser = argparse.ArgumentParser()
0028 parser.add_argument("-I", "--input",help="input file",default="isotk_relval_hi.pkl")
0029 parser.add_argument("-V", "--modelv",help="model version (any number) ",default="1")
0030
0031
0032
0033 fName = parser.parse_args().input
0034 modv = parser.parse_args().modelv
0035
0036
0037
0038 df = pd.read_pickle(fName)
0039 print ("vars in file:",df.keys())
0040 print("events in df original:",df.shape[0])
0041 df = df.loc[df['t_eHcal_y'] > 20]
0042 print("events in df after energy cut:",df.shape[0])
0043 df['t_eHcal_xun'] = df['t_eHcal_x']
0044 df['t_delta_un'] = df['t_delta']
0045 df['t_ieta_un'] = df['t_ieta']
0046
0047 mina = []
0048 maxa = []
0049 keya = []
0050
0051
0052 for i in df.keys():
0053 keya.append(i)
0054 mina.append(df[i].min())
0055 maxa.append(df[i].max())
0056
0057 print('var = ',keya)
0058 print('mina = ',mina)
0059 print('maxa = ',maxa)
0060
0061
0062
0063
0064 cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt','t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
0065
0066
0067
0068 df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
0069
0070 data = df.values
0071 print ('data shape:',data.shape)
0072 targets = data[:,12]
0073 targets.shape
0074
0075
0076
0077
0078
0079 data = df.values
0080 ntest = 20000
0081 testindx = data.shape[0] - ntest
0082 X_train = data[:testindx,0:12]
0083 Y_train = data[:testindx,12]
0084 X_test = data[testindx:,:]
0085 print ("shape of X_train:",X_train.shape)
0086 print ("shape of Y_train:",Y_train.shape)
0087 print ("shape of X_test:",X_test.shape)
0088 meany = np.mean(Y_train)
0089 print ("mean y:",meany)
0090 stdy = np.std(Y_train)
0091 print ("std y:", stdy)
0092
0093
0094
0095
0096
0097
0098 a0 = [0.973, 0.998, 0.992, 0.965 ]
0099 a1 = [0, -0.318, -0.261, -0.406]
0100 a2 = [0, 0, 0.047, 0.089]
0101 def fac0(jeta):
0102 PU_IETA_1 = 9
0103 PU_IETA_2 = 16
0104 PU_IETA_3 = 25
0105 idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0106 return a0[idx]
0107 def fac1(jeta):
0108 PU_IETA_1 = 9
0109 PU_IETA_2 = 16
0110 PU_IETA_3 = 25
0111 idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0112 return a1[idx]
0113 def fac2(jeta):
0114 PU_IETA_1 = 9
0115 PU_IETA_2 = 16
0116 PU_IETA_3 = 25
0117 idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
0118 return a2[idx]
0119
0120 vec0 = np.vectorize(fac0)
0121 vec1 = np.vectorize(fac1)
0122 vec2 = np.vectorize(fac2)
0123
0124 X_test[:,17]
0125 eop = (X_test[:,15]/X_test[:,13])
0126 dop = (X_test[:,16]/X_test[:,13])
0127
0128
0129 mcorrval = vec0(abs(X_test[:,17])) + vec1(abs(X_test[:,17]))*eop*dop*( 1 + vec2(abs(X_test[:,17]))*dop)
0130
0131
0132
0133
0134
0135 def propweights(y_true):
0136 weight = np.copy(y_true)
0137 weight[abs(y_true - meany) > 0] = 1.90*abs(y_true - meany)/stdy
0138
0139 weight[abs(y_true - meany) < stdy] = 1
0140 return weight
0141
0142
0143
0144
0145
0146 from tensorflow.keras import optimizers
0147 print ("creating model=========>")
0148 model = Sequential()
0149 model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
0150 model.add(Dropout(0.2))
0151 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0152 model.add(Dense(500, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0153 model.add(Dense(60, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
0154 model.add(Dense(1))
0155
0156 RMS = tensorflow.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
0157
0158 print ("compilation up next=======>")
0159 model.compile(loss='logcosh',optimizer='adam')
0160 model.summary()
0161
0162 print ("fitting now=========>")
0163 history = model.fit(X_train,Y_train , batch_size=5000, epochs=100, validation_split=0.2, verbose=1,sample_weight=propweights(Y_train))
0164
0165
0166
0167 plt.plot(history.history['loss'])
0168 plt.plot(history.history['val_loss'])
0169 plt.title('model loss')
0170 plt.ylabel('loss')
0171 plt.xlabel('epoch')
0172 plt.legend(['train', 'validation'], loc='upper left')
0173 plt.savefig(modv+'_loss_distributions.png')
0174 plt.close()
0175
0176
0177 preds = model.predict(X_test[:,0:12])
0178 targets = X_test[:,12]
0179 uncorrected = X_test[:,15]
0180 marinascorr = X_test[:,15]*mcorrval
0181 plt.hist(preds, bins =100, range=(0,200),label='PU regression',alpha=0.6)
0182 plt.hist(targets, bins =100, range=(0,200),label='no PU',alpha=0.6)
0183 plt.hist(uncorrected, bins =100, range=(0,200),label='uncorrected',alpha=0.6)
0184
0185 plt.yscale('log')
0186 plt.title("Energy distribution")
0187 plt.legend(loc='upper right')
0188 plt.savefig(modv+'_energy_distributions.png')
0189 plt.close()
0190
0191 preds = preds.reshape(preds.shape[0])
0192 print (preds.shape)
0193
0194
0195
0196
0197
0198
0199 plt.hist(abs(uncorrected -targets)/targets*100, bins =100, range=(0,100),label='uncorrected',alpha=0.6)
0200
0201 plt.hist(100*abs(preds -targets)/targets, bins =100, range=(0,100),label='PU correction',alpha=0.6)
0202
0203 plt.title("error distribution")
0204 plt.legend(loc='upper right')
0205 plt.savefig(modv+'_errors.png')
0206 plt.close()
0207
0208
0209 plt.scatter(targets, uncorrected,alpha=0.3,label='uncorr')
0210 plt.scatter(targets, preds,alpha=0.3,label='PUcorr')
0211 plt.xlabel('True Values ')
0212 plt.ylabel('Predictions ')
0213 plt.legend(loc='upper right')
0214 lims = [0, 200]
0215 plt.xlim(lims)
0216 plt.ylim(lims)
0217 _ = plt.plot(lims, lims)
0218 plt.savefig(modv+'_energyscatt.png')
0219 plt.close()
0220
0221
0222
0223 pmom= X_test[:,13]
0224
0225 plt.hist(targets/pmom, bins =100, range=(0,5),label='E/p noPU',alpha=0.6)
0226
0227
0228 plt.hist(marinascorr/pmom, bins =100, range=(0,5),label='E/p marina corr',alpha=0.6)
0229
0230
0231 plt.legend(loc='upper right')
0232 plt.yscale('log')
0233 plt.title("E/p distributions")
0234 plt.savefig(modv+'_eopdist.png')
0235 plt.close()
0236
0237
0238
0239
0240 import os
0241
0242 if not os.path.exists('models'):
0243 os.makedirs('models')
0244 model.save('models/model'+modv+'.h5')
0245
0246