Common/relval/harvestRelVal.py

0001 #!/usr/bin/env python
0002
0003 import sys
0004 import os
0005
0006 """
0007 arguments [<list-of-processes>]
0008 description:
0009 creates crab.cfg, multicrab.cfg, harvest_*.py
0010 if dbs is set:
0011  prints number of events found in dataset
0012  if no argument is provided, looks for all available datasets for the release
0013  user can edit multicrab and confirm process list as needed
0014 nuno@cern.ch 09.04
0015 """
0016
def print_def():
    """Print the command-line usage line followed by example invocations."""
    examples = (
        "harvestRelVal.py",
        "harvestRelVal.py /RelValTTbar/CMSSW_3_1_0_pre4_STARTUP_30X_v1/GEN-SIM-RECO",
        "harvestRelVal.py <dataset_list.txt>",
    )
    print("Usage:", sys.argv[0], "[list_of_processes]")
    print("Examples:")
    for example in examples:
        print(example)
0023
def check_dbs():
    """Tell whether the DBS client environment appears to be set up.

    Returns 1 when the DBSCMD_HOME environment variable is defined with a
    value other than the placeholder 'NOTSET', and 0 otherwise.
    """
    dbs_home = os.getenv('DBSCMD_HOME', 'NOTSET')
    return 0 if dbs_home == 'NOTSET' else 1
0028
def check_nevts_dset(dset):
    """Return the summed 'NumberOfEvents' over all files of dataset *dset*.

    The file list comes from the module-level DBS handle ``api``.  When the
    module-level flag ``is_dbs`` is false no query is made and -1 is returned.
    """
    if not is_dbs:
        return -1
    file_entries = api.listFiles(path=str(dset))
    return sum(entry['NumberOfEvents'] for entry in file_entries)
0038
def make_dqmname(s):
    """Build the DQM ROOT file name that corresponds to dataset path *s*.

    Each '/' in *s* is turned into '__'; the result is prefixed with
    'DQM_V0001_R000000001' and given a '.root' extension.
    """
    flattened = '__'.join(s.split('/'))
    return 'DQM_V0001_R000000001%s.root' % flattened
0041
def get_name_from_dsetpath(ds):
    """Return the sample name encoded in dataset path *ds*.

    The name is the second '/'-separated field of *ds* (the first one after
    the leading slash) with every occurrence of 'RelVal' removed.
    """
    primary = ds.split('/')[1]
    return ''.join(primary.split('RelVal'))
0046
def get_cond_from_dsetpath(ds) :
    """Extract the conditions tag (such as '30X' or '31X') from *ds*.

    The third '/'-separated field of *ds* is stripped of the release name
    (module-level ``cmssw_ver`` plus '_'), of 'IDEAL_', 'STARTUP_' and
    '_FastSim', and of everything from the character before the first 'v'
    onwards (the '_vN' suffix).  A result that starts with '3' and is at
    most three characters long is accepted as is; otherwise the function
    falls back to looking for '31X' or '30X' inside it.

    Returns the tag string, or 0 when no tag can be determined; callers
    compare the result against 0 to drop the dataset.
    """
    fields = ds.split('/')
    if len(fields) < 3:
        # Not enough fields to carry a processed-dataset name.
        print("skipping malformed dataset path", ds)
        return 0
    tag = fields[2]
    for noise in (cmssw_ver + '_', 'IDEAL_', 'STARTUP_', '_FastSim'):
        tag = tag.replace(noise, '')
    vpos = tag.find('v')
    if vpos == -1:
        # No version suffix: keep the whole tag instead of letting the
        # "not found" value (-1) silently chop the last two characters.
        cb = tag
    else:
        # Drop the 'v...' suffix together with the separator before it.
        cb = tag[:max(vpos - 1, 0)]
    # startswith() is also safe for an empty tag, where cb[0] would raise.
    if not cb.startswith('3') or len(cb) > 3:
        print("problem extracting condition for", ds, " : ", cb, '(len:',len(cb),')')
        if cb.find('31X') != -1:
            cb = '31X'
        elif cb.find('30X') != -1:
            cb = '30X'
        else:
            print("skipping", cb)
            return 0
        print("condition found:", cb)
    else :
        print("good condition for", ds, " : ", cb, '(len:',len(cb),')')
    return cb
0063
0064
def make_dbs_list(dbslf) :
    """Write the RelVal datasets of the current release to file *dbslf*.

    Every dataset path returned by the module-level DBS handle ``api`` is
    kept only if it contains 'RelVal', the release name (module-level
    ``cmssw_ver``) and '/GEN-SIM'; matching paths are written one per line.
    Does nothing when the module-level flag ``is_dbs`` is false.
    """
    if not is_dbs :
        return
    with open(dbslf, 'w') as flis:
        for ads in api.listDatasetPaths() :
            # All three markers are required.  Joining them with 'or' would
            # accept nearly every dataset known to DBS.
            if 'RelVal' in ads and cmssw_ver in ads and "/GEN-SIM" in ads :
                flis.write(ads + '\n')
    print('Generated dataset list', dbslf, 'from dbs.')
    #example of the equivalent command-line query:
    #dbs lsd --path=/RelVal*/CMSSW_3_1_0_pre5*/GEN-SIM-RECO --url=http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet > mylist.txt
    #dbslsd = "dbs lsd --path=/RelVal*/" + cmssw_ver + "*/GEN-SIM-RECO --url=http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    #os.system( '`' + dbslsd + ' > ' + dbslf + '`')
0081
def read_ds_file() :
    """Load dataset paths from the file named by module-level ``dsfile``.

    Each line, with its newline removed, is appended to the module-level
    list ``dsetpaths`` unless it contains a '#', in which case it is
    reported and skipped.  Exits with status 30 when the file is missing.
    """
    if not os.path.exists(dsfile):
        print("problem reading file", dsfile)
        sys.exit(30)
    with open(dsfile, 'r') as listing:
        entries = [raw.replace('\n', '') for raw in listing.readlines()]
    for entry in entries:
        if '#' in entry:
            print('skipping:', entry)
        else:
            dsetpaths.append(entry)
    print('Using data set list in ', dsfile)
0094     print('Using data set list in ', dsfile)
0095
def check_dset() :
    """Validate the module-level dataset list ``dsetpaths`` in place.

    Removes every entry that does not contain the release name
    (module-level ``cmssw_ver``) and every entry for which
    get_cond_from_dsetpath() returns 0, then prints the number of events
    of each remaining dataset.  Exits with status 12 when nothing is left.
    """
    #check cmssw consistency
    # Iterate over a copy: removing from the list being iterated would make
    # the loop skip the entry that follows each removed one.
    for s in dsetpaths[:]:
        if s.find(cmssw_ver) == -1 :
            dsetpaths.remove(s)
            print('Inconsistency found with datset and cmssw version (', cmssw_ver, '): \t ',
                  s, ' has been removed.')
    #check conditions from dsetname
    for s in dsetpaths[:]:
        if get_cond_from_dsetpath(s) == 0 :
            dsetpaths.remove(s)
    #check list size
    nSamples = len(dsetpaths)
    if nSamples == 0 :
        print("Empty input list, exit.")
        sys.exit(12)
    print('Processing', nSamples, 'data sets.')
    #check event numbers
    nSampleEvts = [check_nevts_dset(s) for s in dsetpaths]
    print('number of events per dataset:', nSampleEvts)
0120
def find_dqmref(ds) :
    """Fetch the reference DQM file for dataset *ds* from the previous release.

    The reference release is module-level ``cmssw_ver`` with its trailing
    number decreased by one.  The matching file is copied into the current
    directory with the ``rfcp``/``rfdir`` shell commands and its name is
    read back from the temporary listing 'ref.txt'.

    Returns the local file name, or 'NONE' when references are disabled
    (module-level ``do_reference`` is false), when the release name has no
    usable trailing number, or when no file was found.
    """
    if not do_reference :
        return 'NONE'
    # Split off the trailing number and decrement only that part.  A plain
    # str.replace() of the last digit would rewrite every occurrence of it
    # in the release name (CMSSW_3_1_0_pre3 -> CMSSW_2_1_0_pre2).
    stem = cmssw_ver.rstrip('0123456789')
    counter = cmssw_ver[len(stem):]
    if not counter or int(counter) < 1 :
        print("Cannot derive a reference release from", cmssw_ver)
        return 'NONE'
    ref_ver = stem + str(int(counter) - 1)
    ref_dir = "/castor/cern.ch/user/n/nuno/relval/harvest/" + ref_ver + "/"
    ref_dsf = make_dqmname(ds.replace(cmssw_ver, ref_ver))
    gls = " | grep root | grep "
    #to accept crab appended _1.root in file names, and skip versions/conditions
    gls += ref_dsf[:-25]
    gls += "| awk '{print $9}' "
    os.system("rfcp " + ref_dir  + "`rfdir " + ref_dir + gls + "` .")
    tmpfile = "ref.txt"
    os.system("ls -rtl *" + gls + " > " + tmpfile)
    the_ref = 'NONE'
    if os.path.exists(tmpfile) :
        with open(tmpfile, 'r') as fin :
            ref = fin.readline().replace('\n','')
        # Only accept the listed name if the copy actually produced the file.
        if os.path.exists(ref) :
            the_ref = ref
    print("Found reference file:", the_ref)
    return the_ref
0154
def create_harvest(ds) :
    """Generate the harvesting configuration file for dataset *ds*.

    Runs cmsDriver.py with the conditions tag taken from *ds* (switching to
    IDEAL conditions and to the FastSim / PileUp harvesting sequences when
    the path contains those markers), renames the produced file to the name
    given by make_harv_name(), and appends the DQM workflow name, the input
    file list (when DBS is available) and the reference-handling setting.

    Exits with status 50 when no conditions tag can be extracted and with
    status 40 when cmsDriver did not produce the expected file.
    """
    cond = get_cond_from_dsetpath(ds)
    if cond == 0:
        print('unexpected problem with conditions')
        sys.exit(50)

    driver_cmd = "cmsDriver.py harvest -s HARVESTING:validationHarvesting --mc  --conditions FrontierConditions_GlobalTag,STARTUP_30X::All --harvesting AtJobEnd --no_exec -n -1"
    driver_cmd = driver_cmd.replace('30X', cond)
    driver_out = "harvest_HARVESTING_STARTUP.py"
    if 'IDEAL' in ds:
        driver_cmd = driver_cmd.replace('STARTUP', 'IDEAL')
        driver_out = driver_out.replace('STARTUP', 'IDEAL')
    # Each marker found in the path appends its suffix to the sequence name.
    for marker, suffix in (('FastSim', 'FS'), ('PileUp', 'PU')):
        if marker in ds:
            driver_cmd = driver_cmd.replace('validationHarvesting',
                                            'validationHarvesting' + suffix)

    # Drop a stale output so the existence check below is meaningful.
    if os.path.exists(driver_out):
        os.system("rm " + driver_out)
    print("executing cmsdriver command:\n\t", driver_cmd)
    os.system(driver_cmd)
    if not os.path.exists(driver_out):
        print('problem with cmsdriver file name')
        sys.exit(40)
    os.system("touch " + driver_out)

    harvest_cfg = make_harv_name(ds)
    os.system('mv ' + driver_out + " " + harvest_cfg)
    with open(harvest_cfg, 'a') as cfg:
        cfg.write("\n\n##additions to cmsDriver output \n")
        cfg.write("process.dqmSaver.workflow = '" + ds + "'\n")
        if is_dbs:
            cfg.write("process.source.fileNames = cms.untracked.vstring(\n")
            for entry in api.listFiles(path=ds):
                cfg.write("  '%s',\n" % entry['LogicalFileName'])
            cfg.write(")\n")
        if find_dqmref(ds) != 'NONE':
            cfg.write("process.dqmSaver.referenceHandling = 'all'\n")
0197
def create_mcrab(set, fcrab, fout):
    """Write the multicrab configuration file *fout*.

    The file starts with a [MULTICRAB] section pointing at the crab
    configuration *fcrab* and a [COMMON] section asking for all events in a
    single job; one section per dataset path in *set* is then added through
    append_sample_mcrab().
    """
    total_events = -1
    job_count = 1
    with open(fout, 'w') as cfg:
        for piece in ('[MULTICRAB]',
                      '\ncfg=' + fcrab,
                      '\n\n[COMMON]',
                      '\nCMSSW.total_number_of_events=' + str(total_events),
                      '\nCMSSW.number_of_jobs=' + str(job_count)):
            cfg.write(piece)
        for dsetp in set:
            append_sample_mcrab(dsetp, cfg)
0210
def make_harv_name(dset) :
    """Return the harvesting config file name for dataset path *dset*.

    The name is 'harvest_<sample>.py', where <sample> is the value returned
    by get_name_from_dsetpath().
    """
    sample = get_name_from_dsetpath(dset)
    return 'harvest_%s.py' % sample
0213
def append_sample_mcrab(dsetp, fout):
    """Append the multicrab section for dataset path *dsetp* to *fout*.

    *fout* is an open, writable file object.  The section is named after
    the sample and sets the parameter-set file, the dataset path and the
    output file; when find_dqmref() returns a reference file it is listed
    as an additional input file.  Exits with status 17 when the harvesting
    config for the dataset does not exist.
    """
    harvest_cfg = make_harv_name(dsetp)
    if not os.path.exists(harvest_cfg):
        print('problem creating multicrab, file', harvest_cfg, 'does not exist')
        sys.exit(17)
    section = (
        '\n\n[' + get_name_from_dsetpath(dsetp) + ']',
        '\nCMSSW.pset=' + harvest_cfg,
        '\nCMSSW.datasetpath=' + dsetp,
        '\nCMSSW.output_file=' + make_dqmname(dsetp),
    )
    for entry in section:
        fout.write(entry)

    reference = find_dqmref(dsetp)
    if reference != 'NONE':
        fout.write('\nUSER.additional_input_files=' + reference)
0229
def create_crab(ds) :
    """Write the crab configuration template for dataset *ds*.

    The file named by module-level ``f_crab`` receives the fixed
    ``crab_block`` text followed by the parameter-set file, dataset path
    and output file entries, which therefore land in the [CMSSW] section
    that ends ``crab_block``.
    """
    dqmout = make_dqmname(ds)
    hf = make_harv_name(ds)
    with open(f_crab, 'w') as out:
        out.write(crab_block)
        out.write('\npset=' + hf)
        # Each key needs its own line; without the leading newline this
        # entry is glued onto the end of the 'pset=' line.
        out.write('\ndatasetpath=' + ds)
        out.write('\noutput_file=' + dqmout)

# Fixed part of crab.cfg.  It must end inside the [CMSSW] section because
# create_crab() appends the dataset-specific keys right after it.
crab_block = """
[CRAB]
jobtype = cmssw
scheduler = glite

[EDG]
remove_default_blacklist=1
rb = CERN

[USER]
return_data = 1
#copy_data = 1
#storage_element=srm-cms.cern.ch
#storage_path=/srm/managerv2?SFN=/castor/cern.ch
#user_remote_dir=/user/n/nuno/test
publish_data=0
thresholdLevel=70
eMail=nuno@cern.ch

[CMSSW]
total_number_of_events=-1
show_prod = 1
number_of_jobs=1
"""
0264
0265
#Check arg,settings
# input_type records how the dataset list is obtained:
#   'none' - no argument, query dbs for the datasets of the release
#   'file' - the argument is an existing file listing dataset paths
#   'ds'   - the argument is itself a dataset path
input_type = ''
argin = ''
dsfile = ''
do_reference = False
if len(sys.argv) > 2 :
    print_def()
    sys.exit(10)
elif len(sys.argv) == 1 :
    print("Will search for available datasets.")
    input_type = 'none'
elif len(sys.argv) == 2 :
    argin = sys.argv[1]
    if os.path.exists(argin) :
        dsfile = argin
        input_type = 'file'
    # Both markers must be present.  Testing the raw str.find() result is
    # wrong: it is -1 (true) when the text is absent and 0 (false) when the
    # text starts the string.
    elif 'CMSSW' in argin and 'RelVal' in argin :
        print('Using specified data set', argin)
        input_type = 'ds'
    else :
        print('Invalid argument: process list, dataset or file', \
                  argin, 'does not exist.')
        sys.exit(11)
0290
#dbs
# When the DBS client environment is available, import its API and create
# the module-level handle `api` bound to the global production instance.
is_dbs = check_dbs()
if is_dbs:
    print("dbs home:", os.getenv('DBSCMD_HOME'))
    from DBSAPI.dbsApi import DbsApi
    from DBSAPI.dbsException import *
    from DBSAPI.dbsApiException import *
    from DBSAPI.dbsOptions import DbsOptionParser
    optManager = DbsOptionParser()
    (opts, args) = optManager.getOpt()
    # The parsed options are not used: `args` is rebound to a fixed url.
    args = {'url': "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"}
    api = DbsApi(args)
else:
    print("dbs not set!")
0307
#cmssw
# The release name comes from the environment; stop with setup
# instructions (exit status 12) when it is not defined.
cmssw_ver = os.getenv('CMSSW_VERSION','NOTSET')
if cmssw_ver != 'NOTSET' :
    print("Using cmssw version:", cmssw_ver)
else :
    print("""
    cmssw not set!
    example:
      scramv1 p CMSSW CMSSW_3_1_0_pre5
      cd CMSSW_3_1_0_pre5/src
      eval `scramv1 runtime -sh`
      cd -
    """)
    sys.exit(12)
0322
0323
#read datasets
# Fill dsetpaths from the command-line dataset, from the given list file,
# or from a list generated out of dbs, depending on input_type.
dsetpaths = []

if input_type == 'ds' :
    dsetpaths.append(argin)
elif input_type == 'file' :
    read_ds_file()
elif input_type == 'none' :
    if is_dbs :
        dsfile = cmssw_ver + "_dbslist.txt"
        make_dbs_list(dsfile)
        read_ds_file()
    else :
        print("no dataset specified, and dbs isn't set...")
        print_def()
        sys.exit(13)
0340
0341
#validate the dataset list: incompatible entries are removed in place
check_dset()

#report the datasets to be processed and record them, one per line
print('data sets:', dsetpaths)
with open("dset_processed.txt", 'w') as dslproc :
    for s in dsetpaths :
        dslproc.write(s + '\n')
0351
0352
##Names of the crab configuration files written below
f_crab = 'crab.cfg'
f_multi_crab = 'multicrab.cfg'

##Create the harvest.py and crab.cfg templates from the first dataset
create_harvest(dsetpaths[0])
create_crab(dsetpaths[0])

##Create one harvest config per dataset
for s in dsetpaths:
    create_harvest(s)

##Create multicrab.cfg with one section per dataset
create_mcrab(dsetpaths, f_crab, f_multi_crab)
0367
##Print what has been created

harvfilelist = [make_harv_name(s) for s in dsetpaths]

# PWD is only present when the calling shell exported it; fall back to the
# actual working directory instead of failing with KeyError after all the
# files have been produced.
workdir = os.environ.get("PWD") or os.getcwd()
print('\nCreated:\n\t %(pwd)s/%(cf)s \n\t %(pwd)s/%(mc)s' \
      % {'pwd' : workdir,'cf' : f_crab, 'mc' : f_multi_crab})
print("\tIndividual harvest py's:\n\t", harvfilelist)

print("Done.")