File indexing completed on 2024-04-06 12:31:49
0001 from __future__ import print_function
0002 from __future__ import absolute_import
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 from builtins import range
0015 import array
0016 import os
0017 import re
0018 import sys
0019 from pickle import load
0020 from os.path import dirname,basename,join,isfile
0021 from threading import Thread
0022 from time import asctime
0023
0024 theargv=sys.argv
0025 sys.argv=[]
0026 import ROOT
0027 ROOT.gErrorIgnoreLevel=1001
0028 ROOT.gROOT.SetBatch(True)
0029 sys.argv=theargv
0030
0031 if sys.version_info[0]==2:
0032 from urllib2 import Request,build_opener,urlopen
0033 else:
0034 from urllib.request import Request,build_opener,urlopen
0035
0036 if "RELMON_SA" in os.environ:
0037 from .definitions import *
0038 from .authentication import X509CertOpen
0039 from .utils import __file__ as this_module_name
0040 else:
0041 from Utilities.RelMon.definitions import *
0042 from Utilities.RelMon.authentication import X509CertOpen
0043 from Utilities.RelMon.utils import __file__ as this_module_name
0044
0045
0046
0047
0048 _log_level=10
0049 def logger(msg_level,message):
0050 if msg_level>=_log_level:
0051 print("[%s] %s" %(asctime(),message))
0052
0053
0054 def setTDRStyle():
0055 this_dir=dirname(this_module_name)
0056 this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
0057
0058 style_file=''
0059 if "RELMON_SA" in os.environ:
0060 style_file=this_dir_one_up+"data/tdrstyle_mod.C"
0061 else:
0062 style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
0063 try:
0064 gROOT.ProcessLine(".L %s" %style_file)
0065 gROOT.ProcessLine("setTDRStyle()")
0066 except:
0067 "Print could not set the TDR style. File %s not found?" %style_file
0068
0069
0070
0071 def literal2root (literal,rootType):
0072 bitsarray = array.array('B')
0073 bitsarray.fromstring(literal.decode('hex'))
0074
0075 tbuffer=0
0076 try:
0077 tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
0078 except:
0079 print("could not transform to object array:")
0080 print([ i for i in bitsarray ])
0081
0082
0083 if rootType == 'TPROF':
0084 rootType = 'TProfile'
0085 if rootType == 'TPROF2D':
0086 rootType = 'TProfile2D'
0087
0088 root_class=eval(rootType+'.Class()')
0089
0090 return tbuffer.ReadObject(root_class)
0091
0092
0093
0094 def getNbins(h):
0095 """
0096 To be used in loops on bin number with range()
0097 For each dimension there are GetNbinsX()+2 bins including underflow
0098 and overflow, and range() loops starts from 0. So the total number
0099 of bins as upper limit of a range() loop already includes the next
0100 to last value needed.
0101 """
0102 biny=h.GetNbinsY()
0103 if biny>1: biny+=2
0104 binz=h.GetNbinsZ()
0105 if binz>1:binz+=2
0106 return (h.GetNbinsX()+2)*(biny)*(binz)
0107
0108
0109
0110
0111 class StatisticalTest(object):
0112 def __init__(self,threshold):
0113 self.name=""
0114 self.h1=None
0115 self.h2=None
0116 self.threshold=float(threshold)
0117 self.rank=-1
0118 self.is_init=False
0119
0120 def set_operands(self,h1,h2):
0121 self.h1=h1
0122 self.h2=h2
0123
0124 def get_rank(self):
0125 if not self.is_init:
0126 if self.rank < 0:
0127 type1=type(self.h1)
0128 type2=type(self.h2)
0129 if (type1 != type2):
0130 logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
0131 self.rank=test_codes["DIFF_TYPES"]
0132 elif not self.h2.InheritsFrom("TH1"):
0133 logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
0134 self.rank=test_codes["NO_HIST"]
0135
0136
0137
0138
0139 else:
0140 is_empty1=is_empty(self.h1)
0141 is_empty2=is_empty(self.h2)
0142 are_empty=is_empty1 and is_empty2
0143 one_empty=is_empty1 or is_empty2
0144
0145 Nbins1= getNbins(self.h1)
0146 Nbins2= getNbins(self.h2)
0147
0148 if are_empty:
0149
0150
0151 return 1
0152 elif one_empty:
0153
0154 return 0
0155
0156
0157 if Nbins1!=Nbins2:
0158 return test_codes["DIFF_BIN"]
0159
0160 self.rank=self.do_test()
0161 self.is_init=True
0162 return self.rank
0163
0164 def get_status(self):
0165 status = SUCCESS
0166 if self.get_rank()<0:
0167 status=NULL
0168 logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0169 elif self.get_rank() < self.threshold:
0170 status=FAIL
0171 logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0172 return status
0173
0174 def do_test(self):
0175 pass
0176
0177
0178
0179 def is_empty(h):
0180 for i in range(0,getNbins(h)):
0181 if h.GetBinContent(i)!=0: return False
0182 return True
0183
0184
0185
0186
0187 def is_sparse(h):
0188 filled_bins=0.
0189 nbins=h.GetNbinsX()
0190 for ibin in range(0,nbins+2):
0191 if h.GetBinContent(ibin)>0:
0192 filled_bins+=1
0193
0194 if filled_bins/nbins < .5:
0195 return True
0196 else:
0197 return False
0198
0199
0200
0201 class KS(StatisticalTest):
0202 def __init__(self, threshold):
0203 StatisticalTest.__init__(self,threshold)
0204 self.name="KS"
0205
0206 def do_test(self):
0207
0208
0209 for h in self.h1,self.h2:
0210 w2s=h.GetSumw2()
0211 if w2s.GetSize()==0:
0212 h.Sumw2()
0213
0214
0215
0216
0217
0218
0219
0220
0221
0222
0223
0224 return self.h1.KolmogorovTest(self.h2)
0225
0226
0227 import array
0228 def profile2histo(profile):
0229 if not profile.InheritsFrom("TH1"):
0230 return profile
0231
0232 bin_low_edges=[]
0233 n_bins=profile.GetNbinsX()
0234
0235 for ibin in range(1,n_bins+2):
0236 bin_low_edges.append(profile.GetBinLowEdge(ibin))
0237 bin_low_edges=array.array('f',bin_low_edges)
0238 histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
0239 for ibin in range(0,n_bins+2):
0240 histo.SetBinContent(ibin,profile.GetBinContent(ibin))
0241 histo.SetBinError(ibin,profile.GetBinError(ibin))
0242
0243 return histo
0244
0245
0246 class Chi2(StatisticalTest):
0247 def __init__(self, threshold):
0248 StatisticalTest.__init__(self,threshold)
0249 self.name="Chi2"
0250
0251 def check_filled_bins(self,min_filled):
0252 nbins=self.h1.GetNbinsX()
0253 n_filled_l=[]
0254 for h in self.h1,self.h2:
0255 nfilled=0.
0256 for ibin in range(0,nbins+2):
0257 if h.GetBinContent(ibin)>0:
0258 nfilled+=1
0259 n_filled_l.append(nfilled)
0260 return len([x for x in n_filled_l if x>=min_filled] )>0
0261
0262 def absval(self):
0263 nbins=getNbins(self.h1)
0264 binc=0
0265 for i in range(0,nbins):
0266 for h in self.h1,self.h2:
0267 binc=h.GetBinContent(i)
0268 if binc<0:
0269 h.SetBinContent(i,-1*binc)
0270 if h.GetBinError(i)==0 and binc!=0:
0271
0272 h.SetBinContent(i,0)
0273
0274 def check_histograms(self, histogram):
0275 if histogram.InheritsFrom("TProfile") or (histogram.GetEntries()!=histogram.GetSumOfWeights()):
0276 return 'W'
0277 else:
0278 return 'U'
0279
0280 def do_test(self):
0281 self.absval()
0282 if self.check_filled_bins(3):
0283
0284
0285
0286
0287
0288
0289 hist1 = self.check_histograms(self.h1)
0290 hist2 = self.check_histograms(self.h2)
0291 if hist1 =='W' and hist2 =='W':
0292 chi2 = self.h1.Chi2Test(self.h2,'WW')
0293 return chi2
0294 elif hist1 == 'U' and hist2 == 'U':
0295 chi2 = self.h1.Chi2Test(self.h2,'UU')
0296 return chi2
0297 elif hist1 == 'U' and hist2 == 'W':
0298 chi2 = self.h1.Chi2Test(self.h2,'UW')
0299 return chi2
0300 elif hist1 == 'W' and hist2 == 'U':
0301 chi2 = self.h2.Chi2Test(self.h1,'UW')
0302 return chi2
0303 else:
0304 return 1
0305
0306
0307
0308
0309 class BinToBin(StatisticalTest):
0310 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0311 bins is different. It is % of corresponding bins otherwhise.
0312 A threshold of 1 is needed to require a 1 to 1 correspondance between
0313 hisograms.
0314 """
0315 def __init__(self, threshold=1):
0316 StatisticalTest.__init__(self, threshold)
0317 self.name='BinToBin'
0318 self.epsilon= 0.000001
0319
0320 def checkBinningMatches(self):
0321 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0322 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0323 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0324 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0325 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0326 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0327 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0328 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0329 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0330 return False
0331 return True
0332
0333 def do_test(self):
0334
0335 if not self.checkBinningMatches():
0336 return test_codes["DIFF_BIN"]
0337
0338 equal = 1
0339 nbins = getNbins(self.h1)
0340 n_ok_bins=0.0
0341 for ibin in range(0, nbins):
0342 h1bin=self.h1.GetBinContent(ibin)
0343 h2bin=self.h2.GetBinContent(ibin)
0344 bindiff=h1bin-h2bin
0345
0346 binavg=.5*(h1bin+h2bin)
0347
0348 if binavg==0 or abs(bindiff) < self.epsilon:
0349 n_ok_bins+=1
0350
0351 else:
0352 print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0353
0354
0355
0356
0357 rank=n_ok_bins/nbins
0358
0359 if rank!=1:
0360 print("Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0361
0362 return rank
0363
0364
0365
0366 class BinToBin1percent(StatisticalTest):
0367 """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0368 bins is different. It is % of corresponding bins otherwhise.
0369 A threshold of 1 is needed to require a 1 to 1 correspondance between
0370 hisograms.
0371 """
0372 def __init__(self, threshold=1):
0373 StatisticalTest.__init__(self, threshold)
0374 self.name='BinToBin1percent'
0375 self.epsilon= 0.000001
0376 self.tolerance= 0.01
0377
0378 def checkBinningMatches(self):
0379 if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0380 or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0381 or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0382 or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0383 or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0384 or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0385 or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0386 or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0387 or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0388 return False
0389 return True
0390
0391 def do_test(self):
0392
0393 if not self.checkBinningMatches():
0394 return test_codes["DIFF_BIN"]
0395
0396 equal = 1
0397 nbins = getNbins(self.h1)
0398 n_ok_bins=0.0
0399 for ibin in range(0,nbins):
0400 ibin+=1
0401 h1bin=self.h1.GetBinContent(ibin)
0402 h2bin=self.h2.GetBinContent(ibin)
0403 bindiff=h1bin-h2bin
0404
0405 binavg=.5*(h1bin+h2bin)
0406
0407 if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
0408 n_ok_bins+=1
0409
0410 else:
0411 print("-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin))
0412
0413
0414
0415
0416 rank=n_ok_bins/nbins
0417
0418 if rank!=1:
0419 print("%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0420
0421 return rank
0422
0423 Statistical_Tests={"KS":KS,
0424 "Chi2":Chi2,
0425 "BinToBin":BinToBin,
0426 "BinToBin1percent":BinToBin1percent,
0427 "Bin2Bin":BinToBin,
0428 "b2b":BinToBin,}
0429
0430
0431 def ask_ok(prompt, retries=4, complaint='yes or no'):
0432 while True:
0433 ok = raw_input(prompt)
0434 if ok in ('y', 'ye', 'yes'):
0435 return True
0436 if ok in ('n', 'no'):
0437 return False
0438 retries = retries - 1
0439 if retries < 0:
0440 raise IOError('refusenik user')
0441 print(complaint)
0442
0443
0444
0445 class unpickler(Thread):
0446 def __init__(self,filename):
0447 Thread.__init__(self)
0448 self.filename=filename
0449 self.directory=""
0450
0451 def run(self):
0452 print("Reading directory from %s" %(self.filename))
0453 ifile=open(self.filename,"rb")
0454 self.directory=load(ifile)
0455 ifile.close()
0456
0457
0458
0459 def wget(url):
0460 """ Fetch the WHOLE file, not in bunches... To be optimised.
0461 """
0462 opener=build_opener(X509CertOpen())
0463 datareq = Request(url)
0464 datareq.add_header('authenticated_wget', "The ultimate wgetter")
0465 bin_content=None
0466 try:
0467 filename=basename(url)
0468 print("Checking existence of file %s on disk..."%filename)
0469 if not isfile("./%s"%filename):
0470 bin_content=opener.open(datareq).read()
0471 else:
0472 print("File %s exists, skipping.." %filename)
0473 except ValueError:
0474 print("Error: Unknown url %s" %url)
0475
0476 if bin_content!=None:
0477 ofile = open(filename, 'wb')
0478 ofile.write(bin_content)
0479 ofile.close()
0480
0481
0482
0483
0484 def get_relvaldata_id(file):
0485 """Returns unique relvaldata ID for a given file."""
0486 run_id = re.search('R\d{9}', file)
0487 run = re.search('_RelVal_([\w\d]*)-v\d__', file)
0488 if not run:
0489 run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
0490 if run_id and run:
0491 return (run_id.group(), run.group(1))
0492 return None
0493
0494 def get_relvaldata_cmssw_version(file):
0495 """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
0496 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
0497 gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
0498 if not gr_r_version:
0499 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
0500 if cmssw_release and gr_r_version:
0501 return (cmssw_release[0], gr_r_version[0])
0502
0503 def get_relvaldata_version(file):
0504 """Returns tuple (CMSSW version, run version) for specified file."""
0505 cmssw_version = re.findall('DQM_V(\d*)_', file)
0506 run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
0507 if not run_version:
0508 run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
0509 if cmssw_version and run_version:
0510 return (int(cmssw_version[0]), int(run_version[0]))
0511
0512 def get_relvaldata_max_version(files):
0513 """Returns file with maximum version at a) beggining of the file,
0514 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0515 max_file = files[0]
0516 max_v = get_relvaldata_version(files[0])
0517 for file in files:
0518 file_v = get_relvaldata_version(file)
0519 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0520 max_file = file
0521 max_v = file_v
0522 return max_file
0523
0524
0525 def get_relval_version(file):
0526 """Returns tuple (CMSSW version, run version) for specified file."""
0527 cmssw_version = re.findall('DQM_V(\d*)_', file)
0528 run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
0529 if cmssw_version and run_version:
0530 return (int(cmssw_version[0]), int(run_version[0]))
0531
0532 def get_relval_max_version(files):
0533 """Returns file with maximum version at a) beggining of the file,
0534 e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0535 max_file = files[0]
0536 max_v = get_relval_version(files[0])
0537 for file in files:
0538 file_v = get_relval_version(file)
0539 if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0540 max_file = file
0541 max_v = file_v
0542 return max_file
0543
0544 def get_relval_cmssw_version(file):
0545 cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
0546 gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
0547 if cmssw_release and gr_r_version:
0548 if "PU" in gr_r_version[0][0] and not "FastSim" in file:
0549 __gt = re.sub('^[^_]*_', "", gr_r_version[0][0])
0550 __process_string = gr_r_version[0][1]
0551 return (__gt, __process_string)
0552 elif "PU" in gr_r_version[0][0] and "FastSim" in file:
0553 return (cmssw_release[0], "PU_")
0554 return (cmssw_release[0], gr_r_version[0])
0555
0556 def get_relval_id(file):
0557 """Returns unique relval ID (dataset name) for a given file."""
0558 dataset_name = re.findall('R\d{9}__([\w\D]*)__CMSSW_', file)
0559 __process_string = re.search('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
0560 _ps = ""
0561 if __process_string:
0562 if "PU" in __process_string.group(1) and not "FastSim" in file:
0563 _ps = re.search('^[^_]*_', __process_string.group(1)).group()
0564 elif "PU" in __process_string.group(1) and "FastSim" in file:
0565 return dataset_name[0]+"_", _ps
0566 return dataset_name[0], _ps
0567
0568
0569 def is_relvaldata(files):
0570 is_relvaldata_re = re.compile('_RelVal_')
0571 return any([is_relvaldata_re.search(filename) for filename in files])
0572
0573 def make_files_pairs(files, verbose=True):
0574
0575 if is_relvaldata(files):
0576 is_relval_data = True
0577 get_cmssw_version = get_relvaldata_cmssw_version
0578 get_id = get_relvaldata_id
0579 get_max_version = get_relvaldata_max_version
0580
0581 else:
0582 is_relval_data = False
0583 get_cmssw_version = get_relval_cmssw_version
0584 get_id = get_relval_id
0585 get_max_version = get_relval_max_version
0586
0587
0588
0589 versions_files = dict()
0590 for file in files:
0591 version = get_cmssw_version(file)
0592 if version in versions_files:
0593 versions_files[version].append(file)
0594 else:
0595 versions_files[version] = [file]
0596
0597
0598 if verbose:
0599 print('\nFound versions:')
0600 for version in versions_files:
0601 print('%s: %d files' % (str(version), len(versions_files[version])))
0602
0603 if len(versions_files) <= 1:
0604 print('\nFound too little versions, there is nothing to pair. Exiting...\n')
0605 exit()
0606
0607
0608 versions = versions_files.keys()
0609 sizes = [len(value) for value in versions_files.values()]
0610 v1 = versions[sizes.index(max(sizes))]
0611 versions.remove(v1)
0612 sizes.remove(max(sizes))
0613 v2 = versions[sizes.index(max(sizes))]
0614
0615
0616 if verbose:
0617 print('\nPairing %s (%d files) and %s (%d files)' % (str(v1),
0618 len(versions_files[v1]), str(v2), len(versions_files[v2])))
0619
0620
0621 print('\nGot pairs:')
0622 pairs = []
0623 for unique_id in set([get_id(file) for file in versions_files[v1]]):
0624 if is_relval_data:
0625 dataset_re = re.compile(unique_id[0]+'_')
0626 run_re = re.compile(unique_id[1])
0627 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
0628 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
0629 else:
0630 dataset_re = re.compile(unique_id[0]+'_')
0631 ps_re = re.compile(unique_id[1])
0632
0633 c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and ps_re.search(file)]
0634 c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and ps_re.search(file)]
0635
0636 if len(c1_files) > 0 and len(c2_files) > 0:
0637 first_file = get_max_version(c1_files)
0638 second_file = get_max_version(c2_files)
0639 print('%s\n%s\n' % (first_file, second_file))
0640 pairs.extend((first_file, second_file))
0641 if verbose:
0642 print("Paired and got %d files.\n" % len(pairs))
0643 return pairs