Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-26 02:34:35

0001 ################################################################################
0002 # RelMon: a tool for automatic Release Comparison                              
0003 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
0004 #
0005 #
0006 #                                                                              
0007 # Danilo Piparo CERN - danilo.piparo@cern.ch                                   
0008 #                                                                              
0009 ################################################################################
0010 
0011 
0012 from builtins import range
0013 import array
0014 import os
0015 import re
0016 import sys
0017 from pickle import load
0018 from os.path import dirname,basename,join,isfile
0019 from threading import Thread
0020 from time import asctime
0021 
0022 theargv=sys.argv
0023 sys.argv=[]
0024 import ROOT
0025 ROOT.gErrorIgnoreLevel=1001
0026 ROOT.gROOT.SetBatch(True)
0027 sys.argv=theargv
0028 
0029 if sys.version_info[0]==2:
0030   from urllib2  import Request,build_opener,urlopen
0031 else:
0032   from urllib.request  import Request,build_opener,urlopen
0033 
0034 if "RELMON_SA" in os.environ:
0035   from .definitions import *
0036   from .authentication import X509CertOpen
0037   from .utils import __file__ as this_module_name  
0038 else:
0039   from Utilities.RelMon.definitions import *
0040   from Utilities.RelMon.authentication import X509CertOpen
0041   from Utilities.RelMon.utils import __file__ as this_module_name
0042 
0043 #ROOT.gErrorIgnoreLevel=1001
0044 
0045 
0046 _log_level=10
0047 def logger(msg_level,message):
0048   if msg_level>=_log_level:
0049     print("[%s] %s" %(asctime(),message))
0050 
0051 #-------------------------------------------------------------------------------
0052 def setTDRStyle():  
0053   this_dir=dirname(this_module_name)
0054   this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
0055   #this_dir_two_up=this_dir_one_up[:this_dir_one_up.rfind("/")+1]
0056   style_file=''
0057   if "RELMON_SA" in os.environ:
0058     style_file=this_dir_one_up+"data/tdrstyle_mod.C"
0059   else:
0060     style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
0061   try:
0062     gROOT.ProcessLine(".L %s" %style_file)
0063     gROOT.ProcessLine("setTDRStyle()")
0064   except:
0065     "Print could not set the TDR style. File %s not found?" %style_file
0066     
0067 
0068 #-------------------------------------------------------------------------------
0069 def literal2root (literal,rootType):
0070   bitsarray = array.array('B')
0071   bitsarray.fromstring(literal.decode('hex'))
0072 
0073   tbuffer=0
0074   try:  
0075       tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
0076   except:
0077       print("could not transform to object array:")
0078       print([ i for i in  bitsarray ])
0079   
0080   # replace a couple of shortcuts with the real root class name
0081   if rootType == 'TPROF':
0082       rootType = 'TProfile'
0083   if rootType == 'TPROF2D':
0084       rootType = 'TProfile2D'
0085   
0086   root_class=eval(rootType+'.Class()')
0087   
0088   return tbuffer.ReadObject(root_class)
0089   
0090 #-------------------------------------------------------------------------------
0091 
0092 def getNbins(h):
0093   """
0094   To be used in loops on bin number with range()
0095   For each dimension there are GetNbinsX()+2 bins including underflow 
0096   and overflow, and range() loops starts from 0. So the total number
0097   of bins as upper limit of a range() loop already includes the next 
0098   to last value needed.
0099   """
0100   biny=h.GetNbinsY()
0101   if biny>1: biny+=2
0102   binz=h.GetNbinsZ()
0103   if binz>1:binz+=2
0104   return (h.GetNbinsX()+2)*(biny)*(binz)
0105 
0106 #-------------------------------------------------------------------------------
0107 
0108 
0109 class StatisticalTest(object):
0110   def __init__(self,threshold):
0111     self.name=""
0112     self.h1=None
0113     self.h2=None
0114     self.threshold=float(threshold)
0115     self.rank=-1
0116     self.is_init=False
0117 
0118   def set_operands(self,h1,h2):
0119     self.h1=h1
0120     self.h2=h2
0121 
0122   def get_rank(self):
0123     if not self.is_init:
0124       if self.rank < 0:
0125         type1=type(self.h1)
0126         type2=type(self.h2)
0127         if (type1 != type2):
0128           logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
0129           self.rank=test_codes["DIFF_TYPES"]
0130         elif not self.h2.InheritsFrom("TH1"):
0131           logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
0132           self.rank=test_codes["NO_HIST"]    
0133         # if histos are empty
0134         #elif self.h1.InheritsFrom("TH2") and not "BinToBin" in self.name:
0135           ## 2D!
0136           #return test_codes["2D"]
0137         else:
0138           is_empty1=is_empty(self.h1)
0139           is_empty2=is_empty(self.h2)
0140           are_empty=is_empty1 and is_empty2
0141           one_empty=is_empty1 or is_empty2
0142 
0143           Nbins1= getNbins(self.h1)
0144           Nbins2= getNbins(self.h2)
0145 
0146           if are_empty:
0147             #return -103
0148             # Conversation with JeanRoch and David 5 April
0149             return 1
0150           elif one_empty:
0151             # Due conversation with Giovanni on 2015-09-10
0152             return 0
0153 
0154           # if histos have different number of bins
0155           if Nbins1!=Nbins2:
0156             return test_codes["DIFF_BIN"]
0157 
0158       self.rank=self.do_test()
0159     self.is_init=True
0160     return self.rank
0161 
0162   def get_status(self):
0163     status = SUCCESS
0164     if self.get_rank()<0:
0165       status=NULL
0166       logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))  
0167     elif self.get_rank() < self.threshold:
0168       status=FAIL
0169     logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
0170     return status
0171 
0172   def do_test(self):
0173     pass
0174 
0175 #-------------------------------------------------------------------------------
0176 
0177 def is_empty(h):
0178   for i in range(0,getNbins(h)):
0179     if h.GetBinContent(i)!=0: return False
0180   return True
0181   #return h.GetSumOfWeights()==0
0182 
0183 #-------------------------------------------------------------------------------
0184 
0185 def is_sparse(h):
0186   filled_bins=0.
0187   nbins=h.GetNbinsX()
0188   for ibin in range(0,nbins+2):
0189     if h.GetBinContent(ibin)>0:
0190       filled_bins+=1
0191   #print "%s %s --> %s" %(filled_bins,nbins,filled_bins/nbins)
0192   if filled_bins/nbins < .5:
0193     return True
0194   else:
0195     return False
0196 
0197 #-------------------------------------------------------------------------------
0198 
0199 class KS(StatisticalTest):
0200   def __init__(self, threshold):
0201    StatisticalTest.__init__(self,threshold)
0202    self.name="KS"
0203 
0204   def do_test(self):
0205 
0206     # Calculate errors if not there...
0207     for h in self.h1,self.h2:
0208       w2s=h.GetSumw2()
0209       if w2s.GetSize()==0:
0210         h.Sumw2()
0211 
0212     ## If errors are 0:
0213     #zero_errors=True
0214     #for h in self.h1,self.h2:
0215       #for ibin in xrange(Nbins1):
0216         #if h.GetBinError(ibin+1) >0:
0217           #zero_errors=False
0218           #break
0219     #if zero_errors:
0220       #return test_codes["ZERO_ERR"]
0221 
0222     return self.h1.KolmogorovTest(self.h2)
0223 
0224 #-------------------------------------------------------------------------------
0225 import array
0226 def profile2histo(profile):
0227   if not profile.InheritsFrom("TH1"):
0228     return profile
0229     
0230   bin_low_edges=[]
0231   n_bins=profile.GetNbinsX()
0232   
0233   for ibin in range(1,n_bins+2):
0234     bin_low_edges.append(profile.GetBinLowEdge(ibin))
0235   bin_low_edges=array.array('f',bin_low_edges)
0236   histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
0237   for ibin in range(0,n_bins+2):
0238     histo.SetBinContent(ibin,profile.GetBinContent(ibin))
0239     histo.SetBinError(ibin,profile.GetBinError(ibin))    
0240   
0241   return histo
0242 #-------------------------------------------------------------------------------
0243 
0244 class Chi2(StatisticalTest):
0245   def __init__(self, threshold):
0246     StatisticalTest.__init__(self,threshold)
0247     self.name="Chi2"     
0248 
0249   def check_filled_bins(self,min_filled):
0250     nbins=self.h1.GetNbinsX()
0251     n_filled_l=[]
0252     for h in self.h1,self.h2:
0253       nfilled=0.
0254       for ibin in range(0,nbins+2):
0255         if h.GetBinContent(ibin)>0:
0256           nfilled+=1
0257       n_filled_l.append(nfilled)
0258     return len([x for x in n_filled_l if x>=min_filled] )>0
0259 
0260   def absval(self):
0261     nbins=getNbins(self.h1)
0262     binc=0
0263     for i in range(0,nbins):
0264       for h in self.h1,self.h2:
0265         binc=h.GetBinContent(i)
0266         if binc<0:
0267           h.SetBinContent(i,-1*binc)
0268         if h.GetBinError(i)==0 and binc!=0:
0269           #print "Histo ",h.GetName()," Bin:",i,"-Content:",h.GetBinContent(i)," had zero error"
0270           h.SetBinContent(i,0)
0271 
0272   def check_histograms(self, histogram):
0273       if histogram.InheritsFrom("TProfile") or  (histogram.GetEntries()!=histogram.GetSumOfWeights()):
0274           return 'W'
0275       else:
0276           return 'U'
0277 
0278   def do_test(self):
0279     self.absval()
0280     if self.check_filled_bins(3):
0281       #if self.h1.InheritsFrom("TProfile") or  (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
0282       #  chi2=self.h1.Chi2Test(self.h2,'WW')
0283       #  #if chi2==0: print "DEBUG",self.h1.GetName(),"Chi2 is:", chi2
0284       #  return chi2
0285       #else:
0286       #  return self.h1.Chi2Test(self.h2,'UU')
0287       hist1 = self.check_histograms(self.h1)
0288       hist2 = self.check_histograms(self.h2)
0289       if hist1 =='W' and hist2 =='W': ##in case 
0290           chi2 = self.h1.Chi2Test(self.h2,'WW')     ## the both histograms are weighted
0291           return chi2
0292       elif hist1 == 'U' and hist2 == 'U':
0293           chi2 = self.h1.Chi2Test(self.h2,'UU')    ##the both histograms are unweighted
0294           return chi2
0295       elif hist1 == 'U' and hist2 == 'W':
0296           chi2 = self.h1.Chi2Test(self.h2,'UW')   ## 1st histogram is unweighted, 2nd weighted
0297           return chi2
0298       elif hist1 == 'W' and hist2 == 'U':
0299           chi2 = self.h2.Chi2Test(self.h1,'UW')   ## 1 is wieghted, 2nd unweigthed. so flip order to make a UW comparison
0300           return chi2
0301     else:
0302       return 1
0303       #return test_codes["FEW_BINS"]
0304 
0305 #-------------------------------------------------------------------------------
0306 
0307 class BinToBin(StatisticalTest):
0308   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
0309   bins is different. It is % of corresponding bins otherwhise.
0310   A threshold of 1 is needed to require a 1 to 1 correspondance between 
0311   hisograms.
0312   """
0313   def __init__(self, threshold=1):
0314     StatisticalTest.__init__(self, threshold)
0315     self.name='BinToBin'
0316     self.epsilon= 0.000001
0317 
0318   def checkBinningMatches(self):
0319     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0320            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0321            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0322            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0323            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0324            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0325            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0326            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0327            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0328       return False
0329     return True
0330 
0331   def do_test(self):
0332     # fist check that binning matches
0333     if not self.checkBinningMatches():
0334       return test_codes["DIFF_BIN"]
0335     # then do the real check
0336     equal = 1
0337     nbins = getNbins(self.h1)
0338     n_ok_bins=0.0
0339     for ibin in range(0, nbins):
0340       h1bin=self.h1.GetBinContent(ibin)
0341       h2bin=self.h2.GetBinContent(ibin)
0342       bindiff=h1bin-h2bin
0343 
0344       binavg=.5*(h1bin+h2bin)
0345 
0346       if binavg==0 or abs(bindiff) < self.epsilon:
0347         n_ok_bins+=1
0348         #print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0349       else:
0350         print("Bin %ibin: bindiff %s" %(ibin,bindiff))
0351 
0352       #if abs(bindiff)!=0 :
0353         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
0354     
0355     rank=n_ok_bins/nbins
0356     
0357     if rank!=1:
0358       print("Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0359     
0360     return rank
0361 
0362 #-------------------------------------------------------------------------------
0363 
0364 class BinToBin1percent(StatisticalTest):
0365   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
0366   bins is different. It is % of corresponding bins otherwhise.
0367   A threshold of 1 is needed to require a 1 to 1 correspondance between 
0368   hisograms.
0369   """
0370   def __init__(self, threshold=1):
0371     StatisticalTest.__init__(self, threshold)
0372     self.name='BinToBin1percent'
0373     self.epsilon= 0.000001
0374     self.tolerance= 0.01
0375 
0376   def checkBinningMatches(self):
0377     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
0378            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
0379            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
0380            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
0381            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
0382            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
0383            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
0384            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
0385            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
0386       return False
0387     return True
0388 
0389   def do_test(self):
0390     # fist check that binning matches
0391     if not self.checkBinningMatches():
0392       return test_codes["DIFF_BIN"]
0393     # then do the real check
0394     equal = 1
0395     nbins = getNbins(self.h1)
0396     n_ok_bins=0.0
0397     for ibin in range(0,nbins):
0398       ibin+=1
0399       h1bin=self.h1.GetBinContent(ibin)
0400       h2bin=self.h2.GetBinContent(ibin)
0401       bindiff=h1bin-h2bin
0402 
0403       binavg=.5*(h1bin+h2bin)
0404 
0405       if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
0406         n_ok_bins+=1
0407         #print "Bin %i bin: bindiff %s" %(ibin,bindiff)
0408       else:
0409         print("-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin))
0410 
0411       #if abs(bindiff)!=0 :
0412         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
0413     
0414     rank=n_ok_bins/nbins
0415     
0416     if rank!=1:
0417       print("%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins))
0418     
0419     return rank
0420 #-------------------------------------------------------------------------------
0421 Statistical_Tests={"KS":KS,
0422                    "Chi2":Chi2,
0423                    "BinToBin":BinToBin,
0424                    "BinToBin1percent":BinToBin1percent,
0425                    "Bin2Bin":BinToBin,
0426                    "b2b":BinToBin,}
0427 #-------------------------------------------------------------------------------  
0428 
0429 def ask_ok(prompt, retries=4, complaint='yes or no'):
0430     while True:
0431         ok = raw_input(prompt)
0432         if ok in ('y', 'ye', 'yes'):
0433             return True
0434         if ok in ('n', 'no'):
0435             return False
0436         retries = retries - 1
0437         if retries < 0:
0438             raise IOError('refusenik user')
0439         print(complaint)
0440 
0441 #-------------------------------------------------------------------------------
0442 
0443 class unpickler(Thread):
0444   def __init__(self,filename):
0445     Thread.__init__(self)
0446     self.filename=filename
0447     self.directory=""
0448 
0449   def run(self):
0450     print("Reading directory from %s" %(self.filename))
0451     ifile=open(self.filename,"rb")
0452     self.directory=load(ifile) 
0453     ifile.close()
0454 
0455 #-------------------------------------------------------------------------------    
0456 
0457 def wget(url):
0458   """ Fetch the WHOLE file, not in bunches... To be optimised.
0459   """
0460   opener=build_opener(X509CertOpen())  
0461   datareq = Request(url)
0462   datareq.add_header('authenticated_wget', "The ultimate wgetter")    
0463   bin_content=None
0464   try:
0465     filename=basename(url)  
0466     print("Checking existence of file %s on disk..."%filename)
0467     if not isfile("./%s"%filename):      
0468       bin_content=opener.open(datareq).read()
0469     else:
0470       print("File %s exists, skipping.." %filename)
0471   except ValueError:
0472     print("Error: Unknown url %s" %url)
0473   
0474   if bin_content!=None:  
0475     ofile = open(filename, 'wb')
0476     ofile.write(bin_content)
0477     ofile.close()
0478 
0479 #-------------------------------------------------------------------------------
0480 ##-----------------   Make files pairs:  RelValData utils   --------------------
0481 
0482 def get_relvaldata_id(file):
0483     """Returns unique relvaldata ID for a given file."""
0484     run_id = re.search('R\\d{9}', file)
0485     run = re.search('_RelVal_([\\w\\d]*)-v\\d__', file)
0486     if not run:
0487         run = re.search('GR_R_\\d*_V\\d*C?_([\\w\\d]*)-v\\d__', file)
0488     if run_id and run:
0489         return (run_id.group(), run.group(1))
0490     return None
0491 
0492 def get_relvaldata_cmssw_version(file):
0493     """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
0494     cmssw_release = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
0495     gr_r_version = re.findall('-(GR_R_\\d*_V\\d*\\w?)(?:_RelVal)?_', file)
0496     if not gr_r_version:
0497         gr_r_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-(\\w*)_RelVal_', file)
0498     if cmssw_release and gr_r_version:
0499         return (cmssw_release[0], gr_r_version[0])
0500 
0501 def get_relvaldata_version(file):
0502     """Returns tuple (CMSSW version, run version) for specified file."""
0503     cmssw_version = re.findall('DQM_V(\\d*)_', file)
0504     run_version = re.findall('_RelVal_[\\w\\d]*-v(\\d)__', file)
0505     if not run_version:
0506         run_version = re.findall('GR_R_\\d*_V\\d*C?_[\\w\\d]*-v(\\d)__', file)
0507     if cmssw_version and run_version:
0508         return (int(cmssw_version[0]), int(run_version[0]))
0509 
0510 def get_relvaldata_max_version(files):
0511     """Returns file with maximum version at a) beggining of the file,
0512     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0513     max_file = files[0]
0514     max_v = get_relvaldata_version(files[0])
0515     for file in files:
0516         file_v = get_relvaldata_version(file)
0517         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0518             max_file = file
0519             max_v = file_v
0520     return max_file
0521 
0522 ##-------------------   Make files pairs:  RelVal utils   ---------------------
0523 def get_relval_version(file):
0524     """Returns tuple (CMSSW version, run version) for specified file."""
0525     cmssw_version = re.findall('DQM_V(\\d*)_', file)
0526     run_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-[\\w\\d]*_V\\d*\\w?(?:_[\\w\\d]*)?-v(\\d*)__', file)
0527     if cmssw_version and run_version:
0528         return (int(cmssw_version[0]), int(run_version[0]))
0529 
0530 def get_relval_max_version(files):
0531     """Returns file with maximum version at a) beggining of the file,
0532     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
0533     max_file = files[0]
0534     max_v = get_relval_version(files[0])
0535     for file in files:
0536         file_v = get_relval_version(file)
0537         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
0538             max_file = file
0539             max_v = file_v
0540     return max_file
0541 
0542 def get_relval_cmssw_version(file):
0543     cmssw_release = re.findall('(CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?)-', file)
0544     gr_r_version = re.findall('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-([\\w\\d]*)_V\\d*\\w?(_[\\w\\d]*)?-v', file)
0545     if cmssw_release and gr_r_version:
0546         if "PU" in gr_r_version[0][0] and not "FastSim" in file:
0547             __gt = re.sub('^[^_]*_', "", gr_r_version[0][0])
0548             __process_string = gr_r_version[0][1]
0549             return (__gt, __process_string)
0550         elif "PU" in gr_r_version[0][0] and "FastSim" in file:   #a check for FastSimPU samples
0551             return (cmssw_release[0], "PU_")                     #with possibly different GT's
0552         return (cmssw_release[0], gr_r_version[0])
0553 
0554 def get_relval_id(file):
0555     """Returns unique relval ID (dataset name) for a given file."""
0556     dataset_name = re.findall('R\\d{9}__([\\w\\D]*)__CMSSW_', file)
0557     __process_string = re.search('CMSSW_\\d*_\\d*_\\d*(?:_[\\w\\d]*)?-([\\w\\d]*)_V\\d*\\w?(_[\\w\\d]*)?-v', file)
0558     _ps = ""
0559     if __process_string:
0560         if "PU" in __process_string.group(1) and not "FastSim" in file:
0561             _ps = re.search('^[^_]*_', __process_string.group(1)).group()
0562         elif "PU" in __process_string.group(1) and "FastSim" in file:
0563             return dataset_name[0]+"_", _ps ##some testing is needed
0564     return dataset_name[0], _ps
0565 
0566 ##-------------------------  Make files pairs --------------------------
0567 def is_relvaldata(files):
0568     is_relvaldata_re = re.compile('_RelVal_')
0569     return any([is_relvaldata_re.search(filename) for filename in files])
0570 
0571 def make_files_pairs(files, verbose=True):
0572     ## Select functions to use
0573     if is_relvaldata(files):
0574         is_relval_data = True
0575         get_cmssw_version = get_relvaldata_cmssw_version
0576         get_id = get_relvaldata_id
0577         get_max_version = get_relvaldata_max_version
0578         # print 'Pairing Data RelVal files.'
0579     else:
0580         is_relval_data = False
0581         get_cmssw_version = get_relval_cmssw_version
0582         get_id = get_relval_id
0583         get_max_version = get_relval_max_version
0584         # print 'Pairing Monte Carlo RelVal files.'
0585 
0586     ## Divide files into groups
0587     versions_files = dict()
0588     for file in files:
0589         version = get_cmssw_version(file)
0590         if version in versions_files:
0591             versions_files[version].append(file)
0592         else:
0593             versions_files[version] = [file]
0594 
0595     ## Print the division into groups
0596     if verbose:
0597         print('\nFound versions:')
0598         for version in versions_files:
0599             print('%s: %d files' % (str(version),  len(versions_files[version])))
0600 
0601     if len(versions_files) <= 1:
0602         print('\nFound too little versions, there is nothing to pair. Exiting...\n')
0603         exit()
0604 
0605     ## Select two biggest groups.
0606     versions = versions_files.keys()
0607     sizes = [len(value) for value in versions_files.values()]
0608     v1 = versions[sizes.index(max(sizes))]
0609     versions.remove(v1)
0610     sizes.remove(max(sizes))
0611     v2 = versions[sizes.index(max(sizes))]
0612 
0613     ## Print two biggest groups.
0614     if verbose:
0615         print('\nPairing %s (%d files) and %s (%d files)' % (str(v1),
0616                 len(versions_files[v1]), str(v2), len(versions_files[v2])))
0617 
0618     ## Pairing two versions
0619     print('\nGot pairs:')
0620     pairs = []
0621     for unique_id in set([get_id(file) for file in versions_files[v1]]):
0622         if is_relval_data:
0623             dataset_re = re.compile(unique_id[0]+'_')
0624             run_re = re.compile(unique_id[1])
0625             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
0626             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
0627         else:
0628             dataset_re = re.compile(unique_id[0]+'_')
0629             ps_re = re.compile(unique_id[1])
0630             ##compile a PU re and search also for same PU
0631             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and ps_re.search(file)]
0632             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and ps_re.search(file)]
0633 
0634         if len(c1_files) > 0 and len(c2_files) > 0:
0635             first_file = get_max_version(c1_files)
0636             second_file = get_max_version(c2_files)
0637             print('%s\n%s\n' % (first_file, second_file))
0638             pairs.extend((first_file, second_file))
0639     if verbose:
0640         print("Paired and got %d files.\n" % len(pairs))
0641     return pairs