File indexing completed on 2024-11-26 02:34:35
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012 from builtins import range
0013 import array
0014 import os
0015 import re
0016 import sys
0017 from pickle import load
0018 from os.path import dirname,basename,join,isfile
0019 from threading import Thread
0020 from time import asctime
0021
0022 theargv=sys.argv
0023 sys.argv=[]
0024 import ROOT
0025 ROOT.gErrorIgnoreLevel=1001
0026 ROOT.gROOT.SetBatch(True)
0027 sys.argv=theargv
0028
0029 if sys.version_info[0]==2:
0030 from urllib2 import Request,build_opener,urlopen
0031 else:
0032 from urllib.request import Request,build_opener,urlopen
0033
0034 if "RELMON_SA" in os.environ:
0035 from .definitions import *
0036 from .authentication import X509CertOpen
0037 from .utils import __file__ as this_module_name
0038 else:
0039 from Utilities.RelMon.definitions import *
0040 from Utilities.RelMon.authentication import X509CertOpen
0041 from Utilities.RelMon.utils import __file__ as this_module_name
0042
0043
0044
0045
0046 _log_level=10
0047 def logger(msg_level,message):
0048 if msg_level>=_log_level:
0049 print("[%s] %s" %(asctime(),message))
0050
0051
0052 def setTDRStyle():
0053 this_dir=dirname(this_module_name)
0054 this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
0055
0056 style_file=''
0057 if "RELMON_SA" in os.environ:
0058 style_file=this_dir_one_up+"data/tdrstyle_mod.C"
0059 else:
0060 style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
0061 try:
0062 gROOT.ProcessLine(".L %s" %style_file)
0063 gROOT.ProcessLine("setTDRStyle()")
0064 except:
0065 "Print could not set the TDR style. File %s not found?" %style_file
0066
0067
0068
0069 def literal2root (literal,rootType):
0070 bitsarray = array.array('B')
0071 bitsarray.fromstring(literal.decode('hex'))
0072
0073 tbuffer=0
0074 try:
0075 tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
0076 except:
0077 print("could not transform to object array:")
0078 print([ i for i in bitsarray ])
0079
0080
0081 if rootType == 'TPROF':
0082 rootType = 'TProfile'
0083 if rootType == 'TPROF2D':
0084 rootType = 'TProfile2D'
0085
0086 root_class=eval(rootType+'.Class()')
0087
0088 return tbuffer.ReadObject(root_class)
0089
0090
0091
0092 def getNbins(h):
0093 """
0094 To be used in loops on bin number with range()
0095 For each dimension there are GetNbinsX()+2 bins including underflow
0096 and overflow, and range() loops starts from 0. So the total number
0097 of bins as upper limit of a range() loop already includes the next
0098 to last value needed.
0099 """
0100 biny=h.GetNbinsY()
0101 if biny>1: biny+=2
0102 binz=h.GetNbinsZ()
0103 if binz>1:binz+=2
0104 return (h.GetNbinsX()+2)*(biny)*(binz)
0105
0106
0107
0108
0109 class StatisticalTest(object):
0110 def __init__(self,threshold):
0111 self.name=""
0112 self.h1=None
0113 self.h2=None
0114 self.threshold=float(threshold)
0115 self.rank=-1
0116 self.is_init=False
0117
0118 def set_operands(self,h1,h2):
0119 self.h1=h1
0120 self.h2=h2
0121
0122 def get_rank(self):
0123 if not self.is_init:
0124 if self.rank < 0:
0125 type1=type(self.h1)
0126 type2=type(self.h2)
0127 if (type1 != type2):
0128 logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
0129 self.rank=test_codes["DIFF_TYPES"]
0130 elif not self.h2.InheritsFrom("TH1"):
0131 logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
0132 self.rank=test_codes["NO_HIST"]
0133
0134
0135
0136
0137 else:
0138 is_empty1=is_empty(self.h1)
0139 is_empty2=is_empty(self.h2)
0140 are_empty=is_empty1 and is_empty2
0141 one_empty=is_empty1 or is_empty2
0142
0143 Nbins1= getNbins(self.h1)
0144 Nbins2= getNbins(self.h2)
0145
0146 if are_empty:
0147
0148
0149 return 1
0150 elif one_empty:
0151
0152 return 0
0153
0154
0155 if Nbins1!=Nbins2:
0156 return test_codes["DIFF_BIN"]
0157
0158 self.rank=self.do_test()
0159 self.is_init=True
0160 return self.rank
0161
0162 def get_status(self):
0163 status = SUCCESS
0164 if self.get_rank()<0:
0165 status=NULL
0166 logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0167 elif self.get_rank() < self.threshold:
0168 status=FAIL
0169 logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0170 return status
0171
0172 def do_test(self):
0173 pass
0174
0175
0176
0177 def is_empty(h):
0178 for i in range(0,getNbins(h)):
0179 if h.GetBinContent(i)!=0: return False
0180 return True
0181
0182
0183
0184
0185 def is_sparse(h):
0186 filled_bins=0.
0187 nbins=h.GetNbinsX()
0188 for ibin in range(0,nbins+2):
0189 if h.GetBinContent(ibin)>0:
0190 filled_bins+=1
0191
0192 if filled_bins/nbins < .5:
0193 return True
0194 else:
0195 return False
0196
0197
0198
0199 class KS(StatisticalTest):
0200 def __init__(self, threshold):
0201 StatisticalTest.__init__(self,threshold)
0202 self.name="KS"
0203
0204 def do_test(self):
0205
0206
0207 for h in self.h1,self.h2:
0208 w2s=h.GetSumw2()
0209 if w2s.GetSize()==0:
0210 h.Sumw2()
0211
0212
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222 return self.h1.KolmogorovTest(self.h2)
0223
0224
0225 import array
0226 def profile2histo(profile):
0227 if not profile.InheritsFrom("TH1"):
0228 return profile
0229
0230 bin_low_edges=[]
0231 n_bins=profile.GetNbinsX()
0232
0233 for ibin in range(1,n_bins+2):
0234 bin_low_edges.append(profile.GetBinLowEdge(ibin))
0235 bin_low_edges=array.array('f',bin_low_edges)
0236 histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
0237 for ibin in range(0,n_bins+2):
0238 histo.SetBinContent(ibin,profile.GetBinContent(ibin))
0239 histo.SetBinError(ibin,profile.GetBinError(ibin))
0240
0241 return histo
0242
0243
0244 class Chi2(StatisticalTest):
0245 def __init__(self, threshold):
0246 StatisticalTest.__init__(self,threshold)
0247 self.name="Chi2"
0248
0249 def check_filled_bins(self,min_filled):
0250 nbins=self.h1.GetNbinsX()
0251 n_filled_l=[]
0252 for h in self.h1,self.h2:
0253 nfilled=0.
0254 for ibin in range(0,nbins+2):
0255 if h.GetBinContent(ibin)>0:
0256 nfilled+=1
0257 n_filled_l.append(nfilled)
0258 return len([x for x in n_filled_l if x>=min_filled] )>0
0259
0260 def absval(self):
0261 nbins=getNbins(self.h1)
0262 binc=0
0263 for i in range(0,nbins):
0264 for h in self.h1,self.h2:
0265 binc=h.GetBinContent(i)
0266 if binc<0:
0267 h.SetBinContent(i,-1*binc)
0268 if h.GetBinError(i)==0 and binc!=0:
0269
0270 h.SetBinContent(i,0)
0271
0272 def check_histograms(self, histogram):
0273 if histogram.InheritsFrom("TProfile") or (histogram.GetEntries()!=histogram.GetSumOfWeights()):
0274 return 'W'
0275 else:
0276 return 'U'
0277
0278 def do_test(self):
0279 self.absval()
0280 if self.check_filled_bins(3):
0281
0282
0283
0284
0285
0286
0287 hist1 = self.check_histograms(self.h1)
0288 hist2 = self.check_histograms(self.h2)
0289 if hist1 =='W' and hist2 =='W':
0290 chi2 = self.h1.Chi2Test(self.h2,'WW')
0291 return chi2
0292 elif hist1 == 'U' and hist2 == 'U':
0293 chi2 = self.h1.Chi2Test(self.h2,'UU')
0294 return chi2
0295 elif hist1 == 'U' and hist2 == 'W':
0296 chi2 = self.h1.Chi2Test(self.h2,'UW')
0297 return chi2
0298 elif hist1 == 'W' and hist2 == 'U':
0299 chi2 = self.h2.Chi2Test(self.h1,'UW')
0300 return chi2
0301 else:
0302 return 1
0303
0304
0305
0306
0307 class BinToBin(StatisticalTest):
0308 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0309 bins is different. It is % of corresponding bins otherwhise.
0310 A threshold of 1 is needed to require a 1 to 1 correspondance between
0311 hisograms.
0312 """
0313 def __init__(self, threshold=1):
0314 StatisticalTest.__init__(self, threshold)
0315 self.name='BinToBin'
0316 self.epsilon= 0.000001
0317
0318 def checkBinningMatches(self):
0319 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0320 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0321 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0322 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0323 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0324 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0325 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0326 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0327 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0328 return False
0329 return True
0330
0331 def do_test(self):
0332
0333 if not self.checkBinningMatches():
0334 return test_codes["DIFF_BIN"]
0335
0336 equal = 1
0337 nbins = getNbins(self.h1)
0338 n_ok_bins=0.0
0339 for ibin in range(0, nbins):
0340 h1bin=self.h1.GetBinContent(ibin)
0341 h2bin=self.h2.GetBinContent(ibin)
0342 bindiff=h1bin-h2bin
0343
0344 binavg=.5*(h1bin+h2bin)
0345
0346 if binavg==0 or abs(bindiff) < self.epsilon:
0347 n_ok_bins+=1
0348
0349 else:
0350 print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0351
0352
0353
0354
0355 rank=n_ok_bins/nbins
0356
0357 if rank!=1:
0358 print("Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0359
0360 return rank
0361
0362
0363
0364 class BinToBin1percent(StatisticalTest):
0365 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0366 bins is different. It is % of corresponding bins otherwhise.
0367 A threshold of 1 is needed to require a 1 to 1 correspondance between
0368 hisograms.
0369 """
0370 def __init__(self, threshold=1):
0371 StatisticalTest.__init__(self, threshold)
0372 self.name='BinToBin1percent'
0373 self.epsilon= 0.000001
0374 self.tolerance= 0.01
0375
0376 def checkBinningMatches(self):
0377 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0378 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0379 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0380 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0381 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0382 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0383 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0384 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0385 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0386 return False
0387 return True
0388
0389 def do_test(self):
0390
0391 if not self.checkBinningMatches():
0392 return test_codes["DIFF_BIN"]
0393
0394 equal = 1
0395 nbins = getNbins(self.h1)
0396 n_ok_bins=0.0
0397 for ibin in range(0,nbins):
0398 ibin+=1
0399 h1bin=self.h1.GetBinContent(ibin)
0400 h2bin=self.h2.GetBinContent(ibin)
0401 bindiff=h1bin-h2bin
0402
0403 binavg=.5*(h1bin+h2bin)
0404
0405 if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
0406 n_ok_bins+=1
0407
0408 else:
0409 print("-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin))
0410
0411
0412
0413
0414 rank=n_ok_bins/nbins
0415
0416 if rank!=1:
0417 print("%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0418
0419 return rank
0420
0421 Statistical_Tests={"KS":KS,
0422 "Chi2":Chi2,
0423 "BinToBin":BinToBin,
0424 "BinToBin1percent":BinToBin1percent,
0425 "Bin2Bin":BinToBin,
0426 "b2b":BinToBin,}
0427
0428
0429 def ask_ok(prompt, retries=4, complaint='yes or no'):
0430 while True:
0431 ok = raw_input(prompt)
0432 if ok in ('y', 'ye', 'yes'):
0433 return True
0434 if ok in ('n', 'no'):
0435 return False
0436 retries = retries - 1
0437 if retries < 0:
0438 raise IOError('refusenik user')
0439 print(complaint)
0440
0441
0442
0443 class unpickler(Thread):
0444 def __init__(self,filename):
0445 Thread.__init__(self)
0446 self.filename=filename
0447 self.directory=""
0448
0449 def run(self):
0450 print("Reading directory from %s" %(self.filename))
0451 ifile=open(self.filename,"rb")
0452 self.directory=load(ifile)
0453 ifile.close()
0454
0455
0456
0457 def wget(url):
0458 """ Fetch the WHOLE file, not in bunches... To be optimised.
0459 """
0460 opener=build_opener(X509CertOpen())
0461 datareq = Request(url)
0462 datareq.add_header('authenticated_wget', "The ultimate wgetter")
0463 bin_content=None
0464 try:
0465 filename=basename(url)
0466 print("Checking existence of file %s on disk..."%filename)
0467 if not isfile("./%s"%filename):
0468 bin_content=opener.open(datareq).read()
0469 else:
0470 print("File %s exists, skipping.." %filename)
0471 except ValueError:
0472 print("Error: Unknown url %s" %url)
0473
0474 if bin_content!=None:
0475 ofile = open(filename, 'wb')
0476 ofile.write(bin_content)
0477 ofile.close()
0478
0479
0480
0481
0482 def get_relvaldata_id(file):
0483 """Returns unique relvaldata ID for a given file."""
0484 run_id = re.search('R\\d{9}', file)
0485 run = re.search('_RelVal_([\\w\\d]*)-v\\d__', file)
0486 if not run:
0487 run = re.search('GR_R_\\d*_V\\d*C?_([\\w\\d]*)-v\\d__', file)
0488 if run_id and run:
0489 return (run_id.group(), run.group(1))
0490 return None
0491
0492 def get_relvaldata_cmssw_version(file):
0493 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
0494 cmssw_release = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
0495 gr_r_version = re.findall('-(GR_R_\\d*_V\\d*\\w?)(?:_RelVal)?_', file)
0496 if not gr_r_version:
0497 gr_r_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-(\\w*)_RelVal_', file)
0498 if cmssw_release and gr_r_version:
0499 return (cmssw_release[0], gr_r_version[0])
0500
0501 def get_relvaldata_version(file):
0502 """Returns tuple (CMSSW version, run version) for specified file."""
0503 cmssw_version = re.findall('DQM_V(\\d*)_', file)
0504 run_version = re.findall('_RelVal_[\\w\\d]*-v(\\d)__', file)
0505 if not run_version:
0506 run_version = re.findall('GR_R_\\d*_V\\d*C?_[\\w\\d]*-v(\\d)__', file)
0507 if cmssw_version and run_version:
0508 return (int(cmssw_version[0]), int(run_version[0]))
0509
0510 def get_relvaldata_max_version(files):
0511 """Returns file with maximum version at a) beggining of the file,
0512 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0513 max_file = files[0]
0514 max_v = get_relvaldata_version(files[0])
0515 for file in files:
0516 file_v = get_relvaldata_version(file)
0517 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0518 max_file = file
0519 max_v = file_v
0520 return max_file
0521
0522
0523 def get_relval_version(file):
0524 """Returns tuple (CMSSW version, run version) for specified file."""
0525 cmssw_version = re.findall('DQM_V(\\d*)_', file)
0526 run_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-[\\w\\d]*_V\\d*\\w?(?:_[\\w\\d]*)?-v(\\d*)__', file)
0527 if cmssw_version and run_version:
0528 return (int(cmssw_version[0]), int(run_version[0]))
0529
0530 def get_relval_max_version(files):
0531 """Returns file with maximum version at a) beggining of the file,
0532 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0533 max_file = files[0]
0534 max_v = get_relval_version(files[0])
0535 for file in files:
0536 file_v = get_relval_version(file)
0537 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0538 max_file = file
0539 max_v = file_v
0540 return max_file
0541
0542 def get_relval_cmssw_version(file):
0543 cmssw_release = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
0544 gr_r_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-([\\w\\d]*)_V\\d*\\w?(_[\\w\\d]*)?-v', file)
0545 if cmssw_release and gr_r_version:
0546 if "PU" in gr_r_version[0][0] and not "FastSim" in file:
0547 __gt = re.sub('^[^_]*_', "", gr_r_version[0][0])
0548 __process_string = gr_r_version[0][1]
0549 return (__gt, __process_string)
0550 elif "PU" in gr_r_version[0][0] and "FastSim" in file:
0551 return (cmssw_release[0], "PU_")
0552 return (cmssw_release[0], gr_r_version[0])
0553
0554 def get_relval_id(file):
0555 """Returns unique relval ID (dataset name) for a given file."""
0556 dataset_name = re.findall('R\\d{9}__([\\w\\D]*)__CMSSW_', file)
0557 __process_string = re.search('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-([\\w\\d]*)_V\\d*\\w?(_[\\w\\d]*)?-v', file)
0558 _ps = ""
0559 if __process_string:
0560 if "PU" in __process_string.group(1) and not "FastSim" in file:
0561 _ps = re.search('^[^_]*_', __process_string.group(1)).group()
0562 elif "PU" in __process_string.group(1) and "FastSim" in file:
0563 return dataset_name[0]+"_", _ps
0564 return dataset_name[0], _ps
0565
0566
0567 def is_relvaldata(files):
0568 is_relvaldata_re = re.compile('_RelVal_')
0569 return any([is_relvaldata_re.search(filename) for filename in files])
0570
0571 def make_files_pairs(files, verbose=True):
0572
0573 if is_relvaldata(files):
0574 is_relval_data = True
0575 get_cmssw_version = get_relvaldata_cmssw_version
0576 get_id = get_relvaldata_id
0577 get_max_version = get_relvaldata_max_version
0578
0579 else:
0580 is_relval_data = False
0581 get_cmssw_version = get_relval_cmssw_version
0582 get_id = get_relval_id
0583 get_max_version = get_relval_max_version
0584
0585
0586
0587 versions_files = dict()
0588 for file in files:
0589 version = get_cmssw_version(file)
0590 if version in versions_files:
0591 versions_files[version].append(file)
0592 else:
0593 versions_files[version] = [file]
0594
0595
0596 if verbose:
0597 print('\nFound versions:')
0598 for version in versions_files:
0599 print('%s: %d files' % (str(version), len(versions_files[version])))
0600
0601 if len(versions_files) <= 1:
0602 print('\nFound too little versions, there is nothing to pair. Exiting...\n')
0603 exit()
0604
0605
0606 versions = versions_files.keys()
0607 sizes = [len(value) for value in versions_files.values()]
0608 v1 = versions[sizes.index(max(sizes))]
0609 versions.remove(v1)
0610 sizes.remove(max(sizes))
0611 v2 = versions[sizes.index(max(sizes))]
0612
0613
0614 if verbose:
0615 print('\nPairing %s (%d files) and %s (%d files)' % (str(v1),
0616 len(versions_files[v1]), str(v2), len(versions_files[v2])))
0617
0618
0619 print('\nGot pairs:')
0620 pairs = []
0621 for unique_id in set([get_id(file) for file in versions_files[v1]]):
0622 if is_relval_data:
0623 dataset_re = re.compile(unique_id[0]+'_')
0624 run_re = re.compile(unique_id[1])
0625 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
0626 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
0627 else:
0628 dataset_re = re.compile(unique_id[0]+'_')
0629 ps_re = re.compile(unique_id[1])
0630
0631 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and ps_re.search(file)]
0632 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and ps_re.search(file)]
0633
0634 if len(c1_files) > 0 and len(c2_files) > 0:
0635 first_file = get_max_version(c1_files)
0636 second_file = get_max_version(c2_files)
0637 print('%s\n%s\n' % (first_file, second_file))
0638 pairs.extend((first_file, second_file))
0639 if verbose:
0640 print("Paired and got %d files.\n" % len(pairs))
0641 return pairs