RelMon/scripts/ValidationMatrix.py

0001 #! /usr/bin/env python3
0002 ################################################################################
0003 # RelMon: a tool for automatic Release Comparison
0004 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
0005 #
0006 #
0007 #
0008 # Danilo Piparo CERN - danilo.piparo@cern.ch
0009 #
0010 ################################################################################
0011
0012 from builtins import range
0013 from optparse import OptionParser
0014
0015 import os
0016 import pickle
0017 import glob
0018 from re import search
0019 from subprocess import call,PIPE
0020 from multiprocessing import Pool
0021 from sys import exit
0022
0023 import sys
0024 argv=sys.argv
0025 sys.argv=[]
0026 if "RELMON_SA" in os.environ:
0027   import definitions as definitions
0028   from dqm_interfaces import DirWalkerFile,string2blacklist,DirWalkerFile_thread_wrapper
0029   from dirstructure import Directory
0030   from directories2html import directory2html,make_summary_table
0031   from utils import ask_ok, unpickler, make_files_pairs
0032 else:
0033   import Utilities.RelMon.definitions as definitions
0034   from Utilities.RelMon.dqm_interfaces import DirWalkerFile,string2blacklist,DirWalkerFile_thread_wrapper
0035   from Utilities.RelMon.dirstructure import Directory
0036   from Utilities.RelMon.directories2html import directory2html,make_summary_table
0037   from Utilities.RelMon.utils import ask_ok, unpickler, make_files_pairs
0038 sys.argv=argv
0039
0040 #-------------------------------------------------------------------------------
0041
def name2sample(filename):
  """Return the sample name encoded in a file name.

  The basename of `filename` is split on "__" and the second field is
  returned. Raises IndexError when the basename has fewer than two fields.
  """
  fields = os.path.basename(filename).split("__")
  return fields[1]
0045
def name2version(filename):
  """Return the version string encoded in a file name.

  The basename of `filename` is split on "__" and the third field is
  returned. Raises IndexError when the basename has fewer than three fields.
  """
  fields = os.path.basename(filename).split("__")
  return fields[2]
0049
def name2run(filename):
  """Return the run token encoded in a file name.

  Takes the part of the basename that precedes the first "__" and returns
  its third "_"-separated field. Raises IndexError when that part has
  fewer than three fields.
  """
  leading_field = os.path.basename(filename).partition("__")[0]
  return leading_field.split("_")[2]
0053
def name2runskim(filename):
  """Return "<run>_<skim>" for a file name.

  The run comes from name2run(). The skim is the last "_"-separated token
  of the version string returned by name2version(), with everything from
  the last "-v" onwards (the skim version suffix) removed.
  """
  last_token = name2version(filename).split("_")[-1]
  # rsplit leaves the token untouched when it carries no "-v" suffix.
  skim_name = last_token.rsplit("-v", 1)[0]
  return "%s_%s" % (name2run(filename), skim_name)
0061
def name2globaltag(filename):
  """Return the global tag (GT) encoded in a file name.

  The third "__"-separated field of the basename is split on "-" and its
  second piece is returned. Raises IndexError when the basename does not
  have that structure.
  """
  version_field = os.path.basename(filename).split("__")[2]
  return version_field.split("-")[1]
0065
0066 #-------------------------------------------------------------------------------
0067
def guess_params(ref_filenames,test_filenames):
  """Derive the sample names and the two version strings from paired files.

  The reference and test file lists are walked in lockstep. For each pair
  the sample name and version are extracted with name2sample() and
  name2version(); a mismatch between the two sample names is reported on
  stdout but is not fatal.

  Returns a tuple (samples, cmssw_version1, cmssw_version2):
    * samples: one entry per pair, taken from the reference file. When the
      reference version matches the regex "20[01]" the last "_"-separated
      token of that version is appended to the sample name.
    * cmssw_version1 / cmssw_version2: the version of the first reference
      file and of the first test file respectively.

  Returns ([], "", "") when either input list is empty.
  """
  if len(ref_filenames)*len(test_filenames)==0:
    print("Empty reference and test filenames lists!")
    return [],"",""

  samples=[]
  ref_versions=[]
  test_versions=[]

  for ref_path, test_path in zip(ref_filenames, test_filenames):
    ref=os.path.basename(ref_path)
    test=os.path.basename(test_path)

    ref_sample=name2sample(ref)
    ref_version=name2version(ref)
    test_sample=name2sample(test)
    test_version=name2version(test)

    print("  ## sample 1: %s vs sample 2: %s"%(ref_sample, test_sample))

    # Only a warning: the comparison is still attempted.
    if ref_sample!=test_sample:
      print("Files %s and %s do not seem to be relative to the same sample." %(ref, test))

    # Slightly modify for data
    if search("20[01]",ref_version) is not None:
      ref_sample+=ref_version.split("_")[-1]
    samples.append(ref_sample)

    ref_versions.append(ref_version)
    test_versions.append(test_version)

  # Use the first version seen on each side. Picking element 0 of a set
  # (as done previously) gave an arbitrary, run-dependent choice whenever
  # more than one version was present.
  cmssw_version1=ref_versions[0]
  cmssw_version2=test_versions[0]

  return samples,cmssw_version1,cmssw_version2
0113
0114
0115 #-------------------------------------------------------------------------------
0116
def check_root_files(names_list):
  """Check that every name in `names_list` ends with ".root".

  Prints a message naming the first offending file and returns False as
  soon as one is found; returns True otherwise (including for an empty
  list).
  """
  for name in names_list:
    if not name.endswith(".root"):
      # The file name is interpolated so the user can see which entry failed.
      print("File %s does not seem to be a rootfile. Please check." %name)
      return False
  return True
0123
0124 #-------------------------------------------------------------------------------
0125
def add_to_blacklist(blacklist, pattern, target, blist_piece):
  """Append `blist_piece` to `blacklist` when `pattern` applies to `target`.

  `pattern` is stripped of surrounding whitespace and used as a regular
  expression searched in `target`. A leading '!' negates the condition:
  the piece is then appended only when the rest of the pattern is NOT
  found. Pieces are joined with commas.

  Returns the (possibly extended) blacklist string.
  """
  regex = pattern.strip()
  negated = regex[0] == '!'
  if negated:
    regex = regex[1:]

  found = search(regex, target) is not None
  # The rule applies when exactly one of "found" / "negated" holds.
  if found == negated:
    return blacklist

  if blacklist != "":
    return blacklist + "," + blist_piece
  return blist_piece
0146
0147 #-------------------------------------------------------------------------------
0148
def guess_blacklists(samples,ver1,ver2,hlt):
  """Build a blacklist string for each sample according to a set of rules.

  Every sample starts from a fixed base list. `ver1` decides, through a
  year regex, whether the files are treated as data or as Monte Carlo, and
  `hlt` selects the HLT or the RECO rule set:
    * HLT + MC:    rules from definitions.hlt_mc_pattern_blist_pairs,
                   matched against the sample name
    * HLT + data:  no extra rules
    * RECO + MC:   rules from definitions.mc_pattern_blist_pairs,
                   matched against the sample name
    * RECO + data: rules from definitions.data_pattern_blist_pairs,
                   matched against `ver1`
  `ver2` is accepted but not used.

  Returns a dict mapping each sample to its comma-separated blacklist.
  """
  blacklists = {}
  for sample in samples:
    entries = "FED@1,AlcaBeamMonitor@1,HLT@1,AlCaReco@1"

    if hlt:
      entries += ",AlCaEcalPi0@2"
      if search("2010+|2011+|2012+|2015+", ver1):
        print("We are treating Data files for the HLT")
        # at the moment it does not make sense since hlt is ran already
        rules, target = (), sample
      else:
        print("We are treating MC files for the HLT")
        rules, target = definitions.hlt_mc_pattern_blist_pairs, sample
    elif search("2010+|2011+|2012+", ver1):
      print("We are treating Data files:")
      entries += ",By__Lumi__Section@-1,AlCaReco@1"
      rules, target = definitions.data_pattern_blist_pairs, ver1
    else:
      print("We are treating MC files")
      rules, target = definitions.mc_pattern_blist_pairs, sample

    for pattern, piece in rules:
      entries = add_to_blacklist(entries, pattern, target, piece)

    blacklists[sample] = entries

  return blacklists
0187
0188 #-------------------------------------------------------------------------------
0189
def get_roofiles_in_dir(directory):
  """Return the paths of the ".root" files directly inside `directory`.

  The directory name is echoed on stdout. Each returned path is
  `directory` joined with the file name, in the order given by
  os.listdir(). A real list is returned (not a lazy `map` object) because
  callers take its length.
  """
  print(directory)
  return [os.path.join(directory, name)
          for name in os.listdir(directory)
          if name.endswith(".root")]
0196
0197 #-------------------------------------------------------------------------------
0198
def get_filenames_from_pool(all_samples):
  """Split the root files of a directory into reference and test lists.

  The files returned by get_roofiles_in_dir(all_samples) are listed on
  stdout, arranged by make_files_pairs(), and then dealt out alternately:
  entries at even positions become references, entries at odd positions
  become tests. The resulting pairing is printed for inspection.

  Returns (ref_filenames, test_filenames); both are empty when the
  directory contains no root files.
  """
  candidates = get_roofiles_in_dir(all_samples)

  if len(candidates) == 0:
    print("Zero files found in directory %s!" %all_samples)
    return [],[]

  for candidate in candidates:
    print("* ",candidate)
  # An odd count is only reported; pairing is attempted anyway.
  if len(candidates) % 2 != 0:
    print("The numbuer of file is not even... Trying to recover a catastrophe.")

  paired = make_files_pairs(candidates)

  # Even positions hold the references, odd positions the tests.
  ref_filenames, test_filenames = [], []
  for position, path in enumerate(paired):
    (test_filenames if position % 2 else ref_filenames).append(path)

  print("The guess would be the following:")
  for ref, test in zip(ref_filenames, test_filenames):
    ref_dir = os.path.dirname(ref)
    test_dir = os.path.dirname(test)
    if ref_dir == test_dir:
      shown_dir = ref_dir
    else:
      shown_dir = "%s and %s" %(ref_dir, test_dir)
    print("* Directory: %s " %shown_dir)
    print("  o %s" %os.path.basename(ref))
    print("  o %s" %os.path.basename(test))

  return ref_filenames, test_filenames
0249
0250
0251 #-------------------------------------------------------------------------------
0252
def get_clean_fileanames(ref_samples,test_samples):
  """Turn two comma-separated strings into lists of file names.

  Each entry is stripped of surrounding whitespace. The process exits
  with status 2 when the two lists differ in length or when
  check_root_files() rejects either of them.

  Returns (ref_filenames, test_filenames).
  """
  ref_filenames = [entry.strip() for entry in ref_samples.split(",")]
  test_filenames = [entry.strip() for entry in test_samples.split(",")]

  if len(ref_filenames) != len(test_filenames):
    print("The numebr of reference and test files does not seem to be the same. Please check.")
    exit(2)

  if not check_root_files(ref_filenames) or not check_root_files(test_filenames):
    exit(2)

  return ref_filenames, test_filenames
0265
0266 #-------------------------------------------------------------------------------
0267
def count_alive_processes(p_list):
  """Return how many objects in `p_list` have a `returncode` of None."""
  return sum(1 for proc in p_list if proc.returncode is None)
0270
0271 #-------------------------------------------------------------------------------
0272
def call_compare_using_files(args):
  """Build the compare_using_files.py command line for one pair and run it.

  `args` is a sequence (sample, ref_filename, test_filename, options).
  The output name is "<sample>_<global tag>", where the global tag is read
  from the reference file name; the blacklist comes from
  guess_blacklists(). When options.stat_test is "Bin2Bin" or "BinToBin",
  options.test_threshold is overwritten with 0.9999 before being used.

  Returns whatever subprocess.call() returns for the command.
  """
  sample, ref_filename, test_filename, options = args
  gt = name2globaltag(ref_filename)
  blacklists = guess_blacklists([sample],
                                name2version(ref_filename),
                                name2version(test_filename),
                                options.hlt)

  pieces = [" compare_using_files.py ",
            "%s %s " %(ref_filename, test_filename),
            " -C -R "]
  if options.do_pngs:
    pieces.append(" -p ")
  pieces.append(" -o %s_%s " %(sample, gt))
  pieces.append(" --specify_run ")

  # Bin-to-bin comparisons always run with this fixed threshold.
  if options.stat_test in ["Bin2Bin", "BinToBin"]:
    options.test_threshold = 0.9999
  pieces.append(" -t %s " %options.test_threshold)
  pieces.append(" -s %s " %options.stat_test)

  # Optional switches, in the order they appear on the command line.
  switches = [(options.hlt, " -d HLT "),  # inspect the HLT directories
              (options.hash_name, " --hash_name "),
              (options.blacklist_file, " --use_black_file "),
              (options.standalone, " --standalone ")]
  for enabled, switch in switches:
    if enabled:
      pieces.append(switch)

  if blacklists[sample]:
    pieces.append('-B %s ' %blacklists[sample])

  command = "".join(pieces)
  print("\nExecuting --  %s" %command)

  # The command is split on single spaces; empty tokens are dropped.
  return call([token for token in command.split(" ") if token])
0310
0311
0312 #--------------------------------------------------------------------------------
0313
def do_comparisons_threaded(options):
  """Run all reference/test comparisons in a pool of worker processes.

  The file pairs come either from the directory options.all_samples or
  from the comma-separated lists options.ref_samples / options.test_samples.
  The work is done inside the output directory (options.out_dir, or
  "<version1>VS<version2>" when that is empty), which is created if
  missing. options.input_dir is set to the output directory when it was
  left empty.

  Each pair is handed to call_compare_using_files() through a
  multiprocessing Pool of at most options.n_processes workers. Afterwards
  the pickles found one directory level down are moved into the output
  directory and the working directory is changed to "..".

  Returns 0 when no sample could be identified, None otherwise.
  """
  n_processes = int(options.n_processes)

  if len(options.all_samples)>0:
    ref_filenames,test_filenames=get_filenames_from_pool(options.all_samples)
  else:
    ref_filenames,test_filenames=get_clean_fileanames(options.ref_samples,options.test_samples)

  # make the paths absolute: the working directory changes below
  ref_filenames=list(map(os.path.abspath,ref_filenames))
  test_filenames=list(map(os.path.abspath,test_filenames))

  samples,cmssw_version1,cmssw_version2=guess_params(ref_filenames,test_filenames)

  if len(samples)==0:
    print("No Samples found... Quitting")
    return 0

  outdir=options.out_dir
  if len(outdir)==0:
    print("Creating automatic outdir:", end=' ')
    outdir="%sVS%s" %(cmssw_version1,cmssw_version2)
    print(outdir)
  if len(options.input_dir)==0:
    print("Creating automatic indir:", end=' ')
    options.input_dir=outdir
    print(options.input_dir)

  if not os.path.exists(outdir):
    os.mkdir(outdir)
  os.chdir(outdir)

  # adjust the number of processes: no point in more workers than comparisons
  n_comparisons=len(ref_filenames)
  if n_comparisons < n_processes:
    print("Less comparisons than possible processes: reducing n processes to", end=' ')
    n_processes=n_comparisons
    # Complete the message started above with the actual value.
    print(n_processes)

  ## Compare all pairs of root files
  args_iterable = [list(args) + [options] for args in zip(samples, ref_filenames, test_filenames)]
  pool = Pool(n_processes)
  try:
    pool.map(call_compare_using_files, args_iterable)
  finally:
    # Release the worker processes even when a comparison raised.
    pool.close()
    pool.join()

  # move the pickles on the top, hack
  os.system("mv */*pkl .")

  # NOTE(review): this returns to the starting directory only when outdir
  # is a single relative path component - confirm before using nested or
  # absolute output directories.
  os.chdir("..")
0391 #-------------------------------------------------------------------------------
def do_reports(indir):
  """Run compare_using_files.py on every pickle found in `indir`.

  For each file in `indir` whose name ends with ".pkl" the script is
  started with "-R", optionally "-p", "-P <pickle>" and
  "-o <pickle name without the extension>". Up to options.n_processes
  commands run concurrently; the function waits for all of them before
  returning. The working directory is changed to `indir` for the duration
  and to ".." at the end.

  NOTE: the settings are read from the module-level `options` object,
  which is created by the command-line parsing at the bottom of this
  file; the function therefore only works when the file runs as a script.
  """
  # Imported here because only `call` and `PIPE` are imported from
  # subprocess at file level. `call` blocks and returns an int, so it
  # cannot provide the process objects that are waited on below.
  from subprocess import Popen

  os.chdir(indir)
  pkl_list=[x for x in os.listdir("./") if x.endswith(".pkl")]
  n_processes=int(options.n_processes)
  running_subprocesses=[]
  for pklfilename in pkl_list:
    command = "compare_using_files.py "
    command+= "-R "
    if options.do_pngs:
      command+= " -p "
    command+= "-P %s " %pklfilename
    command+= "-o %s " %pklfilename[:-4]
    print("Executing %s" %command)
    process=Popen([x for x in command.split(" ") if len(x)>0])
    running_subprocesses.append(process)
    # Batch is full: wait for it to drain before launching more.
    if len(running_subprocesses)>=n_processes:
      for p in running_subprocesses:
        p.wait()
      running_subprocesses=[]

  # Wait for the last, possibly incomplete, batch.
  for p in running_subprocesses:
    p.wait()

  os.chdir("..")
0418
0419 #-------------------------------------------------------------------------------
def do_html(options, hashing_flag, standalone):
  """Write the RelMon summary page "RelMonSummary.html".

  When options.reports is set, the per-pickle reports are produced first
  through do_reports(options.input_dir). The summary table is then built
  by make_summary_table() from options.input_dir, using the 'HLT' or the
  'reco' aggregation rules from `definitions` depending on options.hlt.
  `hashing_flag` and `standalone` are passed through to
  make_summary_table() unchanged.

  The page is written into the current working directory.
  """
  if options.reports:
    print("Preparing reports for the single files...")
    do_reports(options.input_dir)

  # check which aggregation rules are to be used
  if options.hlt:
    print("Aggregating directories according to HLT rules")
    rules_key='HLT'
  else:
    rules_key='reco'
  aggregation_rules=definitions.aggr_pairs_dict[rules_key]
  aggregation_rules_twiki=definitions.aggr_pairs_twiki_dict[rules_key]

  table_html = make_summary_table(options.input_dir,aggregation_rules,aggregation_rules_twiki, hashing_flag, standalone)

  # create summary html file; the context manager closes it even if the
  # write fails
  with open("RelMonSummary.html","w") as ofile:
    ofile.write(table_html)
0442
0443 #-------------------------------------------------------------------------------
0444
if __name__ == "__main__":
  # Command-line entry point: parse the options, run the comparisons when
  # samples are given, then build the html summary when an input directory
  # is known (do_comparisons_threaded fills it in when left empty).

  #-----------------------------------------------------------------------------
  # Default values of the command-line options
  ref_samples=""
  test_samples=""
  all_samples=""
  n_processes=1
  out_dir=""
  in_dir=""
  n_threads=1 # do not change this
  run=-1
  stat_test="Chi2"
  test_threshold=0.00001
  hlt=False
  #-----------------------------------------------------------------------------


  parser = OptionParser(usage="usage: %prog [options]")

  # Long option names carry no trailing space, so they are matched exactly
  # instead of only through optparse's abbreviation lookup.
  parser.add_option("-R","--ref_samples",
                    action="store",
                    dest="ref_samples",
                    default=ref_samples,
                    help="The samples that act as reference (comma separated list)")

  parser.add_option("-T","--test_samples",
                    action="store",
                    dest="test_samples",
                    default=test_samples,
                    help="The samples to be tested (comma separated list)")

  parser.add_option("-a","--all_samples",
                    action="store",
                    dest="all_samples",
                    default=all_samples,
                    help="EXPERIMENTAL: Try to sort all samples selected (wildacrds) and organise a comparison")

  parser.add_option("-o","--out_dir",
                    action="store",
                    dest="out_dir",
                    default=out_dir,
                    help="The outdir other than <Version1>VS<Version2>")

  parser.add_option("-p","--do_pngs",
                    action="store_true",
                    dest="do_pngs",
                    default=False,
                    help="EXPERIMENTAL!!! Do the pngs of the comparison (takes 50%% of the total running time) \n(default is %s)" %False)

  parser.add_option("-r","--run",
                    action="store",
                    dest="run",
                    default=run,
                    help="The run to be checked \n(default is %s)" %run)

  parser.add_option("-t","--test_threshold",
                    action="store",
                    dest="test_threshold",
                    default=test_threshold,
                    help="Threshold for the statistical test \n(default is %s)" %test_threshold)

  parser.add_option("-s","--stat_test",
                    action="store",
                    dest="stat_test",
                    default=stat_test,
                    help="Statistical test (KS or Chi2) \n(default is %s)" %stat_test)

  parser.add_option("-N","--numberOfProcesses",
                    action="store",
                    dest="n_processes",
                    default=n_processes,
                    help="Number of parallel processes to be run. Be Polite! \n(default is %s)" %n_processes)

  parser.add_option("--HLT",
                    action="store_true",
                    dest="hlt",
                    default=False,
                    help="Analyse HLT histograms\n(default is %s)" %hlt)

  parser.add_option("-i","--input_dir",
                    action="store",
                    dest="input_dir",
                    default=in_dir,
                    help="Input directory for html creation \n(default is %s)" %in_dir)

  # The help text reports the real default (False) of this flag.
  parser.add_option("--reports",
                    action="store_true",
                    dest="reports",
                    default=False,
                    help="Do the reports for the pickles \n(default is %s)" %False)
##---HASHING---##
  parser.add_option("--hash_name",
                    action="store_true",
                    dest="hash_name",
                    default=False,
                    help="Set if you want to minimize & hash the output HTML files.")
##--Blacklist File --##
  parser.add_option("--use_black_file",
                    action="store_true",
                    dest="blacklist_file",
                    default=False,
                    help="Use a black list file of histograms located @ /RelMon/data")
##-- USE CSS files in web access, for stand-alone usage --##
  parser.add_option("--standalone",
                  action="store_true",
                  dest="standalone",
                  default=False,
                  help="Define that using RelMon in standalone method. Makes CSS files accessible over HTTP")

  (options, args) = parser.parse_args()

  # Nothing to do without either a ref/test pair of lists, a pool
  # directory, or an input directory for the html step.
  if len(options.test_samples)*len(options.ref_samples)+len(options.all_samples)==0 and len(options.input_dir)==0:
    print("No samples given as input.")
    parser.print_help()
    exit(2)

  if len(options.all_samples)>0 or (len(options.ref_samples)*len(options.test_samples)>0):
    do_comparisons_threaded(options)
  if len(options.input_dir)>0:
    do_html(options, options.hash_name, options.standalone)
0565
0566
0567
0568
0569
0570
0571
0572
0573
0574
0575
0576