Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 14:20:50

0001 #!/usr/bin/env python
0002 
0003 from __future__ import print_function
0004 import sys
0005 import os
0006 
0007 """
0008 arguments [<list-of-processes>]
0009 description:
0010 creates crab.cfg, multicrab.cfg, harvest_*.py
0011 if dbs is set:
0012  prints number of events found in dataset
0013  if no argument is provided looks for all available datasets for release
0014  user can edit multicrab and confirm process list as needed
0015 nuno@cern.ch 09.04
0016 """
0017 
def print_def():
    """Print the command-line usage line followed by example invocations."""
    examples = (
        "harvestRelVal.py",
        "harvestRelVal.py /RelValTTbar/CMSSW_3_1_0_pre4_STARTUP_30X_v1/GEN-SIM-RECO",
        "harvestRelVal.py <dataset_list.txt>",
    )
    print("Usage:", sys.argv[0], "[list_of_processes]")
    print("Examples:")
    for example in examples:
        print(example)
0024 
def check_dbs():
    """Tell whether the DBS command-line environment is configured.

    Returns 1 when the DBSCMD_HOME environment variable is defined (and is
    not the literal placeholder 'NOTSET'), otherwise 0.
    """
    dbs_home = os.getenv('DBSCMD_HOME', 'NOTSET')
    return 0 if dbs_home == 'NOTSET' else 1
0029 
def check_nevts_dset(dset):
    """Return the total number of events in dataset `dset`.

    The per-file 'NumberOfEvents' values reported by the module-level DBS
    handle `api` are summed. Returns -1 when DBS is not available
    (module-level flag `is_dbs` is false).
    """
    if is_dbs :
        file_entries = api.listFiles(path=str(dset))
        return sum(entry['NumberOfEvents'] for entry in file_entries)
    return -1
0039 
def make_dqmname(s):
    """Build the DQM output file name for the dataset path `s`.

    Every '/' in `s` becomes '__'; the result is wrapped between the fixed
    prefix 'DQM_V0001_R000000001' and the '.root' extension.
    """
    flattened = '__'.join(s.split('/'))
    return ''.join(('DQM_V0001_R000000001', flattened, '.root'))
0042 
def get_name_from_dsetpath(ds):
    """Return the sample name taken from a dataset path.

    The name is the path component after the first '/', with every
    occurrence of 'RelVal' removed
    (e.g. '/RelValTTbar/<release>/<tier>' -> 'TTbar').
    """
    primary = ds.split('/')[1]
    return primary.replace('RelVal', '')
0047 
def get_cond_from_dsetpath(ds) :
    """Extract the conditions tag (e.g. '30X', '31X') from a dataset path.

    The third '/'-separated field of `ds` is stripped of the release name
    (module-level `cmssw_ver` plus '_'), 'IDEAL_', 'STARTUP_' and
    '_FastSim'; what precedes the '_v<N>' version suffix is taken as the
    condition.

    Returns the condition string, or 0 when none can be determined
    (callers treat 0 as "skip this dataset").
    """
    fields = ds.split('/')
    if len(fields) < 3 :
        # Malformed path: report it through the same "skip" channel instead
        # of raising IndexError.
        print("skipping", ds)
        return 0
    ca = fields[2]
    for token in (cmssw_ver + '_', 'IDEAL_', 'STARTUP_', '_FastSim') :
        ca = ca.replace(token, '')
    # Drop the version suffix: everything from the character before the
    # first 'v' onwards.
    cb = ca[:ca.find('v')-1]
    # startswith() also copes with an empty string, where cb[0] would raise.
    if not cb.startswith('3') or len(cb) > 3 :
        print("problem extracting condition for", ds, " : ", cb, '(len:',len(cb),')')
        if '31X' in cb :
            cb = '31X'
        elif '30X' in cb :
            cb = '30X'
        else :
            print("skipping", cb)
            return 0
        print("condition found:", cb)
    else :
        print("good condition for", ds, " : ", cb, '(len:',len(cb),')')
    return cb
0064 
0065 
def make_dbs_list(dbslf) :
    """Write to file `dbslf` the RelVal datasets of the current release.

    Queries the module-level DBS handle `api` for all dataset paths and
    keeps those that contain 'RelVal', the release name (`cmssw_ver`) and
    '/GEN-SIM', one path per line. Does nothing when DBS is not available
    (`is_dbs` false).
    """
    if not is_dbs :
        return
    with open(dbslf, 'w') as flis :
        for ads in api.listDatasetPaths() :
            # All three criteria must hold; joining them with `or` would
            # list nearly every dataset known to DBS.
            if 'RelVal' in ads and cmssw_ver in ads and "/GEN-SIM" in ads :
                # a stricter alternative would be to require "/GEN-SIM-RECO"
                flis.write(ads + '\n')
    print('Generated dataset list', dbslf, 'from dbs.')
    #example of the equivalent command-line query:
    #dbs lsd --path=/RelVal*/CMSSW_3_1_0_pre5*/GEN-SIM-RECO --url=http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet > mylist.txt
    #dbslsd = "dbs lsd --path=/RelVal*/" + cmssw_ver + "*/GEN-SIM-RECO --url=http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    #os.system( '`' + dbslsd + ' > ' + dbslf + '`')
0082 
def read_ds_file() :
    """Load dataset paths from the file named by module-level `dsfile`.

    Each line (newline removed) that contains no '#' is appended to the
    module-level list `dsetpaths`; lines containing '#' are reported and
    skipped. Exits with status 30 when the file does not exist.
    """
    if not os.path.exists(dsfile) :
        print("problem reading file", dsfile)
        sys.exit(30)
    with open(dsfile, 'r') as listing :
        for raw_line in listing :
            entry = raw_line.replace('\n', '')
            if '#' in entry :
                print('skipping:', entry)
            else :
                dsetpaths.append(entry)
    print('Using data set list in ', dsfile)
0096 
def check_dset() :
    """Validate and prune the module-level dataset list `dsetpaths` in place.

    Steps:
      1. drop datasets whose path does not contain the release name
         (`cmssw_ver`);
      2. drop datasets for which no conditions tag can be extracted;
      3. exit with status 12 if nothing is left;
      4. print the number of events of each remaining dataset.
    """
    #check cmssw consistency
    # Iterate over a copy: removing from the list being iterated would skip
    # the element that follows each removal.
    for s in dsetpaths[:] :
        if s.find(cmssw_ver) == -1 :
            dsetpaths.remove(s)
            print('Inconsistency found with datset and cmssw version (', cmssw_ver, '): \t ', s, ' has been removed.')
    #check conditions from dsetname
    for s in dsetpaths[:] : #nb:need to make a copy here!
        if get_cond_from_dsetpath(s) == 0 :
            dsetpaths.remove(s)
    #check list size
    nSamples = len(dsetpaths)
    if nSamples == 0 :
        print("Empty input list, exit.")
        sys.exit(12)
    print('Processing', nSamples, 'data sets.')
    #check event numbers
    nSampleEvts = [check_nevts_dset(s) for s in dsetpaths]
    print('number of events per dataset:', nSampleEvts)
0121 
def find_dqmref(ds) :
    """Fetch and return the DQM reference file for dataset `ds`.

    The reference release is the current one (`cmssw_ver`) with its final
    digit decremented by one. The matching root file is copied from the
    castor reference directory into the current directory with rfcp, and
    its local name is then read back through the scratch file 'ref.txt'.

    Returns the local reference file name, or 'NONE' when references are
    disabled (`do_reference` false), when the release name does not end in
    a digit, or when no file was found.
    """
    if not do_reference :
        return 'NONE'
    cp = cmssw_ver[-1:]
    if not cp.isdigit() :
        # No trailing number to decrement: no reference release to derive.
        print("cannot derive reference release from", cmssw_ver)
        return 'NONE'
    # Replace only the trailing digit; str.replace() would also rewrite any
    # earlier occurrence of the same digit in the release name.
    ref_ver = cmssw_ver[:-1] + str(int(cp) - 1)
    ref_dir = "/castor/cern.ch/user/n/nuno/relval/harvest/" + ref_ver + "/"
    ref_dsf = make_dqmname(ds.replace(cmssw_ver, ref_ver))
    gls = " | grep root | grep "
    #to accept crab appended _1.root in file names, and skip versions/conditions
    gls += ref_dsf[:-25]
    gls += "| awk '{print $9}' "
    # NOTE(review): these commands are assembled from the dataset name and
    # run through the shell; the name is assumed to be trusted input.
    command = "rfcp " + ref_dir  + "`rfdir " + ref_dir + gls + "` ."
    os.system(command)
    tmpfile = "ref.txt"
    command = "ls -rtl *" + gls + " > " + tmpfile
    os.system(command)
    the_ref = 'NONE'
    if os.path.exists(tmpfile) :
        with open(tmpfile, 'r') as fin :
            ref = fin.readline().replace('\n','')
        if os.path.exists(ref) :
            the_ref = ref
    print("Found reference file:", the_ref)
    return the_ref
0155 
def create_harvest(ds) :
    """Create the harvesting configuration file for dataset `ds`.

    Runs cmsDriver.py (with --no_exec) using the conditions extracted from
    the dataset path, renames the resulting harvest_HARVESTING_<type>.py to
    make_harv_name(ds) and appends to it the dqmSaver workflow, the list of
    input files (when DBS is available) and the reference handling (when a
    reference file is found).

    Exits with status 50 when no conditions can be extracted and with
    status 40 when the expected cmsDriver output file is missing.
    """
    cond = get_cond_from_dsetpath(ds)
    if cond == 0 :
        print('unexpected problem with conditions')
        sys.exit(50)

    cmsdriver = "cmsDriver.py harvest -s HARVESTING:validationHarvesting --mc  --conditions FrontierConditions_GlobalTag,STARTUP_30X::All --harvesting AtJobEnd --no_exec -n -1"
    cmsdriver = cmsdriver.replace('30X', cond)

    # The name of the file checked for after cmsDriver runs follows the
    # conditions type.
    is_ideal = 'IDEAL' in ds
    if is_ideal :
        cmsdriver = cmsdriver.replace('STARTUP', 'IDEAL')
        fin_name = "harvest_HARVESTING_IDEAL.py"
    else :
        fin_name = "harvest_HARVESTING_STARTUP.py"

    # Select the specialised harvesting sequence for FastSim / PileUp samples.
    for marker, suffix in (('FastSim', 'FS'), ('PileUp', 'PU')) :
        if marker in ds :
            cmsdriver = cmsdriver.replace('validationHarvesting', 'validationHarvesting' + suffix)

    if os.path.exists(fin_name) :
        os.system("rm " + fin_name)
    print("executing cmsdriver command:\n\t", cmsdriver)
    os.system(cmsdriver)
    if not os.path.exists(fin_name) :
        print('problem with cmsdriver file name')
        sys.exit(40)
    os.system("touch " + fin_name)
    hf = make_harv_name(ds)
    os.system('mv ' + fin_name + " " + hf)

    with open(hf, 'a') as out :
        out.write("\n\n##additions to cmsDriver output \n")
        out.write("process.dqmSaver.workflow = '" + ds + "'\n")
        if is_dbs :
            out.write("process.source.fileNames = cms.untracked.vstring(\n")
            for afile in api.listFiles(path=ds) :
                out.write("  '%s',\n" % afile['LogicalFileName'])
            out.write(")\n")

        dqmref = find_dqmref(ds)
        if dqmref != 'NONE' :
            out.write("process.dqmSaver.referenceHandling = 'all'\n")
0198 
def create_mcrab(set, fcrab, fout):
    """Write the multicrab configuration file `fout`.

    The file starts with a [MULTICRAB] section pointing at the crab
    configuration `fcrab` and a [COMMON] section (all events, one job),
    followed by one section per dataset path in `set`.
    """
    nevt = -1
    njob = 1
    with open(fout, 'w') as out:
        out.write('[MULTICRAB]')
        out.write('\ncfg=' + fcrab)
        out.write('\n\n[COMMON]')
        out.write('\nCMSSW.total_number_of_events=' + str(nevt))
        out.write('\nCMSSW.number_of_jobs=' + str(njob))
        for dataset in set:
            append_sample_mcrab(dataset, out)
0211 
def make_harv_name(dset) :
    """Return the harvesting config file name, 'harvest_<sample>.py', for `dset`."""
    sample = get_name_from_dsetpath(dset)
    return ''.join(('harvest_', sample, '.py'))
0214 
def append_sample_mcrab(dsetp, fout):
    """Append the multicrab section for dataset path `dsetp` to `fout`.

    `fout` is an already opened, writable file object. The section names
    the sample and sets its pset, dataset path and output file; when a
    reference file is found it is added as an additional input file.
    Exits with status 17 if the harvesting config for the dataset does not
    exist yet.
    """
    dqm = make_dqmname(dsetp)
    sample = get_name_from_dsetpath(dsetp)
    hf = make_harv_name(dsetp)
    if not os.path.exists(hf) :
        print('problem creating multicrab, file', hf, 'does not exist')
        sys.exit(17)

    section_lines = (
        '\n\n[' + sample + ']',
        '\nCMSSW.pset=' + hf,
        '\nCMSSW.datasetpath=' + dsetp,
        '\nCMSSW.output_file=' + dqm,
    )
    for text in section_lines :
        fout.write(text)

    dqmref = find_dqmref(dsetp)
    if dqmref != 'NONE' :
        fout.write('\nUSER.additional_input_files=' + dqmref)
0230 
def create_crab(ds) :
    """Write the crab configuration template for dataset `ds`.

    The file named by module-level `f_crab` receives the fixed
    `crab_block` text followed by the pset, datasetpath and output_file
    settings, one per line.
    """
    dqmout = make_dqmname(ds)
    hf = make_harv_name(ds)
    with open(f_crab, 'w') as out :
        out.write(crab_block)
        out.write('\npset=' + hf)
        # Each setting needs its own line; without the leading newline
        # 'datasetpath=' is glued onto the end of the pset line.
        out.write('\ndatasetpath=' + ds)
        out.write('\noutput_file=' + dqmout)
0240 
# Fixed text of the crab configuration file. create_crab() writes it
# verbatim and then appends the pset, datasetpath and output_file settings,
# which therefore land after the [CMSSW] section that ends this text.
crab_block = """
[CRAB]
jobtype = cmssw
scheduler = glite

[EDG]
remove_default_blacklist=1
rb = CERN

[USER]
return_data = 1
#copy_data = 1
#storage_element=srm-cms.cern.ch
#storage_path=/srm/managerv2?SFN=/castor/cern.ch
#user_remote_dir=/user/n/nuno/test
publish_data=0
thresholdLevel=70
eMail=nuno@cern.ch

[CMSSW]
total_number_of_events=-1
show_prod = 1
number_of_jobs=1
"""
0265 
0266 
#Check arg,settings
# input_type records how the datasets were specified:
#   'none' - no argument, datasets are looked up later
#   'file' - the argument is an existing file listing dataset paths
#   'ds'   - the argument is a single dataset path
input_type = ''
argin = ''
dsfile = ''
do_reference = False
if len(sys.argv) > 2 :
    print_def()
    sys.exit(10)
elif len(sys.argv) == 1 :
    print("Will search for available datasets.")
    input_type = 'none'
elif len(sys.argv) == 2 :
    argin = sys.argv[1]
    if os.path.exists(argin) :
        dsfile = argin
        input_type = 'file'
    # Both markers must be present. A bare str.find() result is -1 (truthy)
    # when the substring is missing and 0 (falsy) when it is found at the
    # start, so it cannot be used directly as a condition.
    elif 'CMSSW' in argin and 'RelVal' in argin :
        print('Using specified data set', argin)
        input_type = 'ds'
    else :
        print('Invalid argument: process list, dataset or file', \
                  argin, 'does not exist.')
        sys.exit(11)
0291 
#dbs
# When DBSCMD_HOME is set, import the DBS client and create the module-level
# handle `api` that check_nevts_dset, make_dbs_list and create_harvest use.
# When it is not set, `api` is never defined and those functions rely on
# the `is_dbs` flag to avoid touching it.
is_dbs = check_dbs()
if not is_dbs:
    print("dbs not set!")
else:
    print("dbs home:", os.getenv('DBSCMD_HOME'))
    from DBSAPI.dbsApi import DbsApi
    from DBSAPI.dbsException import *
    from DBSAPI.dbsApiException import *
    from DBSAPI.dbsOptions import DbsOptionParser
    # NOTE(review): the parsed options are not used below - `args` is
    # overwritten with a hard-coded url; confirm whether getOpt() is still
    # needed for its side effects.
    optManager  = DbsOptionParser()
    (opts,args) = optManager.getOpt()
    #api = DbsApi(opts.__dict__)
    args={}
    args['url']= "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    api = DbsApi(args)
0308 
#cmssw
# The release name comes from the CMSSW_VERSION environment variable; the
# script cannot continue without it (exit status 12).
cmssw_ver = os.getenv('CMSSW_VERSION','NOTSET')
if cmssw_ver != 'NOTSET' :
    print("Using cmssw version:", cmssw_ver)
else :
    print("""
    cmssw not set!
    example:
      scramv1 p CMSSW CMSSW_3_1_0_pre5
      cd CMSSW_3_1_0_pre5/src
      eval `scramv1 runtime -sh`
      cd -
    """)
    sys.exit(12)
0323     
0324 
#read datasets
# Fill dsetpaths according to how the input was given on the command line.
dsetpaths = []

if input_type == 'ds' :
    dsetpaths.append(argin)
elif input_type == 'file' :
    read_ds_file()
elif input_type == 'none' :
    if is_dbs :
        # No input given: build the list of datasets from DBS and read it back.
        dsfile = cmssw_ver + "_dbslist.txt"
        make_dbs_list(dsfile)
        read_ds_file()
    else :
        print("no dataset specified, and dbs isn't set...")
        print_def()
        sys.exit(13)
0341 
0342 
#check dataset list: remove incompatible dsets
check_dset()

#print dataset list to be processed, and record it in dset_processed.txt
print('data sets:', dsetpaths)
with open("dset_processed.txt", 'w') as dslproc :
    dslproc.writelines(path + '\n' for path in dsetpaths)


##Create harvest.py template (from the first dataset)
first_dset = dsetpaths[0]
create_harvest(first_dset)

##Create crab.cfg template (from the first dataset)
f_crab = 'crab.cfg'
create_crab(first_dset)

##Create harvest_n.py for individual datasets
for s in dsetpaths :
    create_harvest(s)

##Create multicrab.cfg
f_multi_crab = 'multicrab.cfg'
create_mcrab(dsetpaths, f_crab, f_multi_crab)

##Print what has been created
harvfilelist = [make_harv_name(s) for s in dsetpaths]

created = {'pwd' : os.environ["PWD"], 'cf' : f_crab, 'mc' : f_multi_crab}
print('\nCreated:\n\t %(pwd)s/%(cf)s \n\t %(pwd)s/%(mc)s' % created)
print("\tIndividual harvest py's:\n\t", harvfilelist)

print("Done.")