RelMon/python/utils.py

0001 from __future__ import print_function
0002 from __future__ import absolute_import
0003 ################################################################################
0004 # RelMon: a tool for automatic Release Comparison
0005 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
0006 #
0007 #
0008 #
0009 # Danilo Piparo CERN - danilo.piparo@cern.ch
0010 #
0011 ################################################################################
0012
0013
0014 from builtins import range
0015 import array
0016 import os
0017 import re
0018 import sys
0019 from pickle import load
0020 from os.path import dirname,basename,join,isfile
0021 from threading import Thread
0022 from time import asctime
0023
0024 theargv=sys.argv
0025 sys.argv=[]
0026 import ROOT
0027 ROOT.gErrorIgnoreLevel=1001
0028 ROOT.gROOT.SetBatch(True)
0029 sys.argv=theargv
0030
0031 if sys.version_info[0]==2:
0032   from urllib2  import Request,build_opener,urlopen
0033 else:
0034   from urllib.request  import Request,build_opener,urlopen
0035
0036 if "RELMON_SA" in os.environ:
0037   from .definitions import *
0038   from .authentication import X509CertOpen
0039   from .utils import __file__ as this_module_name
0040 else:
0041   from Utilities.RelMon.definitions import *
0042   from Utilities.RelMon.authentication import X509CertOpen
0043   from Utilities.RelMon.utils import __file__ as this_module_name
0044
0045 #ROOT.gErrorIgnoreLevel=1001
0046
0047
0048 _log_level=10
0049 def logger(msg_level,message):
0050   if msg_level>=_log_level:
0051     print("[%s] %s" %(asctime(),message))
0052
0053 #-------------------------------------------------------------------------------
0054 def setTDRStyle():
0055   this_dir=dirname(this_module_name)
0056   this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
0057   #this_dir_two_up=this_dir_one_up[:this_dir_one_up.rfind("/")+1]
0058   style_file=''
0059   if "RELMON_SA" in os.environ:
0060     style_file=this_dir_one_up+"data/tdrstyle_mod.C"
0061   else:
0062     style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
0063   try:
0064     gROOT.ProcessLine(".L %s" %style_file)
0065     gROOT.ProcessLine("setTDRStyle()")
0066   except:
0067     "Print could not set the TDR style. File %s not found?" %style_file
0068
0069
0070 #-------------------------------------------------------------------------------
0071 def literal2root (literal,rootType):
0072   bitsarray = array.array('B')
0073   bitsarray.fromstring(literal.decode('hex'))
0074
0075   tbuffer=0
0076   try:
0077       tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
0078   except:
0079       print("could not transform to object array:")
0080       print([ i for i in  bitsarray ])
0081
0082   # replace a couple of shortcuts with the real root class name
0083   if rootType == 'TPROF':
0084       rootType = 'TProfile'
0085   if rootType == 'TPROF2D':
0086       rootType = 'TProfile2D'
0087
0088   root_class=eval(rootType+'.Class()')
0089
0090   return tbuffer.ReadObject(root_class)
0091
0092 #-------------------------------------------------------------------------------
0093
0094 def getNbins(h):
0095   """
0096   To be used in loops on bin number with range()
0097   For each dimension there are GetNbinsX()+2 bins including underflow
0098   and overflow, and range() loops starts from 0. So the total number
0099   of bins as upper limit of a range() loop already includes the next
0100   to last value needed.
0101   """
0102   biny=h.GetNbinsY()
0103   if biny>1: biny+=2
0104   binz=h.GetNbinsZ()
0105   if binz>1:binz+=2
0106   return (h.GetNbinsX()+2)*(biny)*(binz)
0107
0108 #-------------------------------------------------------------------------------
0109
0110
0111 class StatisticalTest(object):
0112   def __init__(self,threshold):
0113     self.name=""
0114     self.h1=None
0115     self.h2=None
0116     self.threshold=float(threshold)
0117     self.rank=-1
0118     self.is_init=False
0119
0120   def set_operands(self,h1,h2):
0121     self.h1=h1
0122     self.h2=h2
0123
0124   def get_rank(self):
0125     if not self.is_init:
0126       if self.rank < 0:
0127         type1=type(self.h1)
0128         type2=type(self.h2)
0129         if (type1 != type2):
0130           logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
0131           self.rank=test_codes["DIFF_TYPES"]
0132         elif not self.h2.InheritsFrom("TH1"):
0133           logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
0134           self.rank=test_codes["NO_HIST"]
0135         # if histos are empty
0136         #elif self.h1.InheritsFrom("TH2") and not "BinToBin" in self.name:
0137           ## 2D!
0138           #return test_codes["2D"]
0139         else:
0140           is_empty1=is_empty(self.h1)
0141           is_empty2=is_empty(self.h2)
0142           are_empty=is_empty1 and is_empty2
0143           one_empty=is_empty1 or is_empty2
0144
0145           Nbins1= getNbins(self.h1)
0146           Nbins2= getNbins(self.h2)
0147
0148           if are_empty:
0149             #return -103
0150             # Conversation with JeanRoch and David 5 April
0151             return 1
0152           elif one_empty:
0153             # Due conversation with Giovanni on 2015-09-10
0154             return 0
0155
0156           # if histos have different number of bins
0157           if Nbins1!=Nbins2:
0158             return test_codes["DIFF_BIN"]
0159
0160       self.rank=self.do_test()
0161     self.is_init=True
0162     return self.rank
0163
0164   def get_status(self):
0165     status = SUCCESS
0166     if self.get_rank()<0:
0167       status=NULL
0168       logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0169     elif self.get_rank() < self.threshold:
0170       status=FAIL
0171     logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0172     return status
0173
0174   def do_test(self):
0175     pass
0176
0177 #-------------------------------------------------------------------------------
0178
0179 def is_empty(h):
0180   for i in range(0,getNbins(h)):
0181     if h.GetBinContent(i)!=0: return False
0182   return True
0183   #return h.GetSumOfWeights()==0
0184
0185 #-------------------------------------------------------------------------------
0186
0187 def is_sparse(h):
0188   filled_bins=0.
0189   nbins=h.GetNbinsX()
0190   for ibin in range(0,nbins+2):
0191     if h.GetBinContent(ibin)>0:
0192       filled_bins+=1
0193   #print "%s %s --> %s" %(filled_bins,nbins,filled_bins/nbins)
0194   if filled_bins/nbins < .5:
0195     return True
0196   else:
0197     return False
0198
0199 #-------------------------------------------------------------------------------
0200
0201 class KS(StatisticalTest):
0202   def __init__(self, threshold):
0203    StatisticalTest.__init__(self,threshold)
0204    self.name="KS"
0205
0206   def do_test(self):
0207
0208     # Calculate errors if not there...
0209     for h in self.h1,self.h2:
0210       w2s=h.GetSumw2()
0211       if w2s.GetSize()==0:
0212         h.Sumw2()
0213
0214     ## If errors are 0:
0215     #zero_errors=True
0216     #for h in self.h1,self.h2:
0217       #for ibin in xrange(Nbins1):
0218         #if h.GetBinError(ibin+1) >0:
0219           #zero_errors=False
0220           #break
0221     #if zero_errors:
0222       #return test_codes["ZERO_ERR"]
0223
0224     return self.h1.KolmogorovTest(self.h2)
0225
0226 #-------------------------------------------------------------------------------
0227 import array
0228 def profile2histo(profile):
0229   if not profile.InheritsFrom("TH1"):
0230     return profile
0231
0232   bin_low_edges=[]
0233   n_bins=profile.GetNbinsX()
0234
0235   for ibin in range(1,n_bins+2):
0236     bin_low_edges.append(profile.GetBinLowEdge(ibin))
0237   bin_low_edges=array.array('f',bin_low_edges)
0238   histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
0239   for ibin in range(0,n_bins+2):
0240     histo.SetBinContent(ibin,profile.GetBinContent(ibin))
0241     histo.SetBinError(ibin,profile.GetBinError(ibin))
0242
0243   return histo
0244 #-------------------------------------------------------------------------------
0245
0246 class Chi2(StatisticalTest):
0247   def __init__(self, threshold):
0248     StatisticalTest.__init__(self,threshold)
0249     self.name="Chi2"
0250
0251   def check_filled_bins(self,min_filled):
0252     nbins=self.h1.GetNbinsX()
0253     n_filled_l=[]
0254     for h in self.h1,self.h2:
0255       nfilled=0.
0256       for ibin in range(0,nbins+2):
0257         if h.GetBinContent(ibin)>0:
0258           nfilled+=1
0259       n_filled_l.append(nfilled)
0260     return len([x for x in n_filled_l if x>=min_filled] )>0
0261
0262   def absval(self):
0263     nbins=getNbins(self.h1)
0264     binc=0
0265     for i in range(0,nbins):
0266       for h in self.h1,self.h2:
0267         binc=h.GetBinContent(i)
0268         if binc<0:
0269           h.SetBinContent(i,-1*binc)
0270         if h.GetBinError(i)==0 and binc!=0:
0271           #print "Histo ",h.GetName()," Bin:",i,"-Content:",h.GetBinContent(i)," had zero error"
0272           h.SetBinContent(i,0)
0273
0274   def check_histograms(self, histogram):
0275       if histogram.InheritsFrom("TProfile") or  (histogram.GetEntries()!=histogram.GetSumOfWeights()):
0276           return 'W'
0277       else:
0278           return 'U'
0279
0280   def do_test(self):
0281     self.absval()
0282     if self.check_filled_bins(3):
0283       #if self.h1.InheritsFrom("TProfile") or  (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
0284       #  chi2=self.h1.Chi2Test(self.h2,'WW')
0285       #  #if chi2==0: print "DEBUG",self.h1.GetName(),"Chi2 is:", chi2
0286       #  return chi2
0287       #else:
0288       #  return self.h1.Chi2Test(self.h2,'UU')
0289       hist1 = self.check_histograms(self.h1)
0290       hist2 = self.check_histograms(self.h2)
0291       if hist1 =='W' and hist2 =='W': ##in case
0292           chi2 = self.h1.Chi2Test(self.h2,'WW')     ## the both histograms are weighted
0293           return chi2
0294       elif hist1 == 'U' and hist2 == 'U':
0295           chi2 = self.h1.Chi2Test(self.h2,'UU')    ##the both histograms are unweighted
0296           return chi2
0297       elif hist1 == 'U' and hist2 == 'W':
0298           chi2 = self.h1.Chi2Test(self.h2,'UW')   ## 1st histogram is unweighted, 2nd weighted
0299           return chi2
0300       elif hist1 == 'W' and hist2 == 'U':
0301           chi2 = self.h2.Chi2Test(self.h1,'UW')   ## 1 is wieghted, 2nd unweigthed. so flip order to make a UW comparison
0302           return chi2
0303     else:
0304       return 1
0305       #return test_codes["FEW_BINS"]
0306
0307 #-------------------------------------------------------------------------------
0308
0309 class BinToBin(StatisticalTest):
0310   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0311   bins is different. It is % of corresponding bins otherwhise.
0312   A threshold of 1 is needed to require a 1 to 1 correspondance between
0313   hisograms.
0314   """
0315   def __init__(self, threshold=1):
0316     StatisticalTest.__init__(self, threshold)
0317     self.name='BinToBin'
0318     self.epsilon= 0.000001
0319
0320   def checkBinningMatches(self):
0321     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0322            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0323            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0324            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0325            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0326            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0327            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0328            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0329            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0330       return False
0331     return True
0332
0333   def do_test(self):
0334     # fist check that binning matches
0335     if not self.checkBinningMatches():
0336       return test_codes["DIFF_BIN"]
0337     # then do the real check
0338     equal = 1
0339     nbins = getNbins(self.h1)
0340     n_ok_bins=0.0
0341     for ibin in range(0, nbins):
0342       h1bin=self.h1.GetBinContent(ibin)
0343       h2bin=self.h2.GetBinContent(ibin)
0344       bindiff=h1bin-h2bin
0345
0346       binavg=.5*(h1bin+h2bin)
0347
0348       if binavg==0 or abs(bindiff) < self.epsilon:
0349         n_ok_bins+=1
0350         #print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0351       else:
0352         print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0353
0354       #if abs(bindiff)!=0 :
0355         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
0356
0357     rank=n_ok_bins/nbins
0358
0359     if rank!=1:
0360       print("Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0361
0362     return rank
0363
0364 #-------------------------------------------------------------------------------
0365
0366 class BinToBin1percent(StatisticalTest):
0367   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of
0368   bins is different. It is % of corresponding bins otherwhise.
0369   A threshold of 1 is needed to require a 1 to 1 correspondance between
0370   hisograms.
0371   """
0372   def __init__(self, threshold=1):
0373     StatisticalTest.__init__(self, threshold)
0374     self.name='BinToBin1percent'
0375     self.epsilon= 0.000001
0376     self.tolerance= 0.01
0377
0378   def checkBinningMatches(self):
0379     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0380            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0381            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0382            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0383            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0384            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0385            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0386            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0387            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0388       return False
0389     return True
0390
0391   def do_test(self):
0392     # fist check that binning matches
0393     if not self.checkBinningMatches():
0394       return test_codes["DIFF_BIN"]
0395     # then do the real check
0396     equal = 1
0397     nbins = getNbins(self.h1)
0398     n_ok_bins=0.0
0399     for ibin in range(0,nbins):
0400       ibin+=1
0401       h1bin=self.h1.GetBinContent(ibin)
0402       h2bin=self.h2.GetBinContent(ibin)
0403       bindiff=h1bin-h2bin
0404
0405       binavg=.5*(h1bin+h2bin)
0406
0407       if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
0408         n_ok_bins+=1
0409         #print "Bin %i bin: bindiff %s" %(ibin,bindiff)
0410       else:
0411         print("-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin))
0412
0413       #if abs(bindiff)!=0 :
0414         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
0415
0416     rank=n_ok_bins/nbins
0417
0418     if rank!=1:
0419       print("%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0420
0421     return rank
0422 #-------------------------------------------------------------------------------
0423 Statistical_Tests={"KS":KS,
0424                    "Chi2":Chi2,
0425                    "BinToBin":BinToBin,
0426                    "BinToBin1percent":BinToBin1percent,
0427                    "Bin2Bin":BinToBin,
0428                    "b2b":BinToBin,}
0429 #-------------------------------------------------------------------------------
0430
0431 def ask_ok(prompt, retries=4, complaint='yes or no'):
0432     while True:
0433         ok = raw_input(prompt)
0434         if ok in ('y', 'ye', 'yes'):
0435             return True
0436         if ok in ('n', 'no'):
0437             return False
0438         retries = retries - 1
0439         if retries < 0:
0440             raise IOError('refusenik user')
0441         print(complaint)
0442
0443 #-------------------------------------------------------------------------------
0444
0445 class unpickler(Thread):
0446   def __init__(self,filename):
0447     Thread.__init__(self)
0448     self.filename=filename
0449     self.directory=""
0450
0451   def run(self):
0452     print("Reading directory from %s" %(self.filename))
0453     ifile=open(self.filename,"rb")
0454     self.directory=load(ifile)
0455     ifile.close()
0456
0457 #-------------------------------------------------------------------------------
0458
0459 def wget(url):
0460   """ Fetch the WHOLE file, not in bunches... To be optimised.
0461   """
0462   opener=build_opener(X509CertOpen())
0463   datareq = Request(url)
0464   datareq.add_header('authenticated_wget', "The ultimate wgetter")
0465   bin_content=None
0466   try:
0467     filename=basename(url)
0468     print("Checking existence of file %s on disk..."%filename)
0469     if not isfile("./%s"%filename):
0470       bin_content=opener.open(datareq).read()
0471     else:
0472       print("File %s exists, skipping.." %filename)
0473   except ValueError:
0474     print("Error: Unknown url %s" %url)
0475
0476   if bin_content!=None:
0477     ofile = open(filename, 'wb')
0478     ofile.write(bin_content)
0479     ofile.close()
0480
0481 #-------------------------------------------------------------------------------
0482 ##-----------------   Make files pairs:  RelValData utils   --------------------
0483
0484 def get_relvaldata_id(file):
0485     """Returns unique relvaldata ID for a given file."""
0486     run_id = re.search('R\d{9}', file)
0487     run = re.search('_RelVal_([\w\d]*)-v\d__', file)
0488     if not run:
0489         run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
0490     if run_id and run:
0491         return (run_id.group(), run.group(1))
0492     return None
0493
0494 def get_relvaldata_cmssw_version(file):
0495     """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
0496     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
0497     gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
0498     if not gr_r_version:
0499         gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
0500     if cmssw_release and gr_r_version:
0501         return (cmssw_release[0], gr_r_version[0])
0502
0503 def get_relvaldata_version(file):
0504     """Returns tuple (CMSSW version, run version) for specified file."""
0505     cmssw_version = re.findall('DQM_V(\d*)_', file)
0506     run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
0507     if not run_version:
0508         run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
0509     if cmssw_version and run_version:
0510         return (int(cmssw_version[0]), int(run_version[0]))
0511
0512 def get_relvaldata_max_version(files):
0513     """Returns file with maximum version at a) beggining of the file,
0514     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0515     max_file = files[0]
0516     max_v = get_relvaldata_version(files[0])
0517     for file in files:
0518         file_v = get_relvaldata_version(file)
0519         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0520             max_file = file
0521             max_v = file_v
0522     return max_file
0523
0524 ##-------------------   Make files pairs:  RelVal utils   ---------------------
0525 def get_relval_version(file):
0526     """Returns tuple (CMSSW version, run version) for specified file."""
0527     cmssw_version = re.findall('DQM_V(\d*)_', file)
0528     run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
0529     if cmssw_version and run_version:
0530         return (int(cmssw_version[0]), int(run_version[0]))
0531
0532 def get_relval_max_version(files):
0533     """Returns file with maximum version at a) beggining of the file,
0534     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0535     max_file = files[0]
0536     max_v = get_relval_version(files[0])
0537     for file in files:
0538         file_v = get_relval_version(file)
0539         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0540             max_file = file
0541             max_v = file_v
0542     return max_file
0543
0544 def get_relval_cmssw_version(file):
0545     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
0546     gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
0547     if cmssw_release and gr_r_version:
0548         if "PU" in gr_r_version[0][0] and not "FastSim" in file:
0549             __gt = re.sub('^[^_]*_', "", gr_r_version[0][0])
0550             __process_string = gr_r_version[0][1]
0551             return (__gt, __process_string)
0552         elif "PU" in gr_r_version[0][0] and "FastSim" in file:   #a check for FastSimPU samples
0553             return (cmssw_release[0], "PU_")                     #with possibly different GT's
0554         return (cmssw_release[0], gr_r_version[0])
0555
0556 def get_relval_id(file):
0557     """Returns unique relval ID (dataset name) for a given file."""
0558     dataset_name = re.findall('R\d{9}__([\w\D]*)__CMSSW_', file)
0559     __process_string = re.search('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
0560     _ps = ""
0561     if __process_string:
0562         if "PU" in __process_string.group(1) and not "FastSim" in file:
0563             _ps = re.search('^[^_]*_', __process_string.group(1)).group()
0564         elif "PU" in __process_string.group(1) and "FastSim" in file:
0565             return dataset_name[0]+"_", _ps ##some testing is needed
0566     return dataset_name[0], _ps
0567
0568 ##-------------------------  Make files pairs --------------------------
0569 def is_relvaldata(files):
0570     is_relvaldata_re = re.compile('_RelVal_')
0571     return any([is_relvaldata_re.search(filename) for filename in files])
0572
0573 def make_files_pairs(files, verbose=True):
0574     ## Select functions to use
0575     if is_relvaldata(files):
0576         is_relval_data = True
0577         get_cmssw_version = get_relvaldata_cmssw_version
0578         get_id = get_relvaldata_id
0579         get_max_version = get_relvaldata_max_version
0580         # print 'Pairing Data RelVal files.'
0581     else:
0582         is_relval_data = False
0583         get_cmssw_version = get_relval_cmssw_version
0584         get_id = get_relval_id
0585         get_max_version = get_relval_max_version
0586         # print 'Pairing Monte Carlo RelVal files.'
0587
0588     ## Divide files into groups
0589     versions_files = dict()
0590     for file in files:
0591         version = get_cmssw_version(file)
0592         if version in versions_files:
0593             versions_files[version].append(file)
0594         else:
0595             versions_files[version] = [file]
0596
0597     ## Print the division into groups
0598     if verbose:
0599         print('\nFound versions:')
0600         for version in versions_files:
0601             print('%s: %d files' % (str(version),  len(versions_files[version])))
0602
0603     if len(versions_files) <= 1:
0604         print('\nFound too little versions, there is nothing to pair. Exiting...\n')
0605         exit()
0606
0607     ## Select two biggest groups.
0608     versions = versions_files.keys()
0609     sizes = [len(value) for value in versions_files.values()]
0610     v1 = versions[sizes.index(max(sizes))]
0611     versions.remove(v1)
0612     sizes.remove(max(sizes))
0613     v2 = versions[sizes.index(max(sizes))]
0614
0615     ## Print two biggest groups.
0616     if verbose:
0617         print('\nPairing %s (%d files) and %s (%d files)' % (str(v1),
0618                 len(versions_files[v1]), str(v2), len(versions_files[v2])))
0619
0620     ## Pairing two versions
0621     print('\nGot pairs:')
0622     pairs = []
0623     for unique_id in set([get_id(file) for file in versions_files[v1]]):
0624         if is_relval_data:
0625             dataset_re = re.compile(unique_id[0]+'_')
0626             run_re = re.compile(unique_id[1])
0627             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
0628             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
0629         else:
0630             dataset_re = re.compile(unique_id[0]+'_')
0631             ps_re = re.compile(unique_id[1])
0632             ##compile a PU re and search also for same PU
0633             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and ps_re.search(file)]
0634             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and ps_re.search(file)]
0635
0636         if len(c1_files) > 0 and len(c2_files) > 0:
0637             first_file = get_max_version(c1_files)
0638             second_file = get_max_version(c2_files)
0639             print('%s\n%s\n' % (first_file, second_file))
0640             pairs.extend((first_file, second_file))
0641     if verbose:
0642         print("Paired and got %d files.\n" % len(pairs))
0643     return pairs