Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 11:56:36

0001 #!/usr/bin/env python3
0002 #  Submit jobs that are setup in local mps database to batch system
0003 #
0004 #  The bsub syntax: bsub -J 'jobname' -q 'queue name' theProgram
0005 #  The jobname will be something like MP_2015.
0006 #  The queue name is derived from lib.classInfo.
0007 #  The program is theScript.sh located in each job-directory.
0008 #  There may be the other option -R (see man bsub for info).
0009 #
0010 #  Usage:
0011 #
0012 #  mps_fire.py [-a] [-m [-f]] [maxjobs]
0013 #  mps_fire.py -h
0014 
0015 from __future__ import print_function
0016 from builtins import range
0017 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0018 import Alignment.MillePedeAlignmentAlgorithm.mpslib.tools as mps_tools
0019 import os
0020 import sys
0021 import glob
0022 import shutil
0023 if sys.version_info[0]>2:
0024   import _pickle as cPickle
0025 else:
0026   import cPickle
0027 import subprocess
0028 import re
0029 import argparse
0030 
def forward_proxy(rundir):
    """Copy the local VOMS proxy into `rundir` as '.user_proxy'.

    The script is terminated with exit code 1 if `mps_tools.check_proxy()`
    does not report a usable proxy.

    Arguments:
    - `rundir`: directory that receives the copy of the proxy file
    """

    proxy_ok = mps_tools.check_proxy()
    if not proxy_ok:
        print("Please create proxy via 'voms-proxy-init -voms cms -rfc'.")
        sys.exit(1)

    # 'voms-proxy-info --path' prints the location of the current proxy file
    raw_location = subprocess.check_output(["voms-proxy-info", "--path"])
    source = raw_location.decode().strip()
    destination = os.path.join(rundir, ".user_proxy")
    shutil.copyfile(source, destination)
0044 
0045 
def write_HTCondor_submit_file_pede(path, script, config, lib):
    """Writes the 'job.submit' file of a pede job in `path`.

    Arguments:
    - `path`: job directory
    - `script`: script to be executed
    - `config`: cfg file; its name is joined onto `path` and loaded to read
                the 'threads' pede option
    - `lib`: MPS lib object

    Returns the path of the written submit file.
    """

    resources = lib.get_class("pede").split("_")[1:] # strip off 'htcondor'
    job_flavour = resources[-1]

    job_submit_template="""\
universe              = vanilla
executable            = {script:s}
output                = {jobm:s}/STDOUT
error                 = {jobm:s}/STDOUT
log                   = {jobm:s}/HTCJOB
notification          = Always
transfer_output_files = ""
request_memory        = {pedeMem:d}M

# adapted to space used on eos for binaries:
request_disk          = {disk:d}

# adapted to threads parameter in pede options and number of available cores
request_cpus          = {cpus:d}

+JobFlavour           = "{flavour:s}"
"""
    if "bigmem" in resources:
        job_submit_template += """\
+BigMemJob            = True
+AccountingGroup      = "group_u_CMS.e_cms_caf_bigmem"

# automatically remove the job if the submitter has no permissions to run a BigMemJob
periodic_remove       = !regexp("group_u_CMS.e_cms_caf_bigmem", AccountingGroup) && BigMemJob =?= True
"""
    job_submit_template += "\nqueue\n"

    print("Determine number of pede threads...")
    # use the function arguments here, not module-level variables of the
    # calling script, so that the function works for any caller
    cms_process = mps_tools.get_process_object(os.path.join(path, config))
    pede_options = cms_process.AlignmentProducer.algoConfig.pedeSteerer.options.value()
    n_threads = 1
    for option in pede_options:
        if "threads" in option:
            # the option may carry several numbers; request the largest one
            n_threads = max(int(value)
                            for value in option.replace("threads", "").split())
            break
    # HTCondor machines have (currently) 16 cores, i.e. we ensure here that
    # the job would fit core-wise on one machine
    n_threads = min(n_threads, 16)

    print("Determine required disk space on remote host...")
    # determine usage by each file instead of whole directory as this is what
    # matters for the specified disk usage:
    cmd = ["du", "--apparent-size"]
    disk_usage = 0
    for directory in ("binaries", "monitors", "tree_files"):
        files = glob.glob(os.path.join(lib.mssDir, directory, "*"))
        if not files:
            # without operands 'du' would measure the current working
            # directory instead of reporting nothing
            continue
        du_output = subprocess.check_output(cmd + files).decode()
        disk_usage += sum(int(line.split()[0])
                          for line in du_output.splitlines())
    disk_usage *= 1.1 # reserve 10% additional space

    job_submit_file = os.path.join(path, "job.submit")
    with open(job_submit_file, "w") as f:
        f.write(job_submit_template.format(script = os.path.abspath(script),
                                           jobm = os.path.abspath(path),
                                           pedeMem = lib.pedeMem,
                                           disk = int(disk_usage),
                                           cpus = n_threads,
                                           flavour = job_flavour))

    return job_submit_file
0124 
def write_HTCondor_submit_file_mille(path, script, lib, proxy_path=None):
    """Writes the 'job.submit' file of a mille job in `path`.

    Arguments:
    - `path`: job directory
    - `script`: script to be executed
    - `lib`: MPS lib object
    - `proxy_path`: path to proxy (only used in case of requested proxy forward)

    Returns the path of the written submit file.
    """

    resources = lib.get_class("mille").split("_")[1:] # strip off 'htcondor'
    job_flavour = resources[-1]

    job_submit_template="""\
universe              = vanilla
executable            = {script:s}
output                = {jobm:s}/STDOUT
error                 = {jobm:s}/STDOUT
log                   = {jobm:s}/HTCJOB
notification          = Always
transfer_output_files = ""

+JobFlavour           = "{flavour:s}"
"""
    if "cafalca" in resources:
        job_submit_template += """\
+CAFJob              = True
+AccountingGroup     = "group_u_CMS.CAF.ALCA"
# automatically remove the job if the submitter has no permissions to run a CAF Job
periodic_remove       = !regexp("group_u_CMS.CAF.ALCA", AccountingGroup) && CAFJob =?= True
"""

    if proxy_path is not None:
        job_submit_template += """\
+x509userproxy        = "{proxy:s}"
"""
    job_submit_template += "\nqueue\n"

    # use the `path` argument here, not a module-level variable of the
    # calling script, so that the function works for any caller
    job_submit_file = os.path.join(path, "job.submit")
    with open(job_submit_file, "w") as f:
        # `proxy` is only consumed if the x509userproxy line was appended above
        f.write(job_submit_template.format(script = os.path.abspath(script),
                                           jobm = os.path.abspath(path),
                                           flavour = job_flavour,
                                           proxy = proxy_path))

    return job_submit_file
0171 
0172 
0173 
# Command-line interface: one optional positional argument (number of Mille
# jobs to submit) plus switches selecting Mille/Pede submission, explicit job
# IDs, resubmission and proxy forwarding.
parser = argparse.ArgumentParser(
        description="Submit jobs that are setup in local mps database to batch system.",
)
parser.add_argument("maxJobs", type=int, nargs='?', default=1,
                    help="number of Mille jobs to be submitted (default: %(default)d)")
parser.add_argument("-j", "--job-id", dest = "job_id", nargs = "*",
                    help = ("job IDs to be submitted; "
                            "use either 'job<ID>' or directly '<ID>'"))
parser.add_argument("-r", "--resubmit", dest = "resub", default=False,
                    action="store_true",
                    help = ("resubmit jobs - only works if job IDs are specified"))
parser.add_argument("-a", "--all", dest="allMille", default=False,
                    action="store_true",
                    help = ("submit all setup Mille jobs; "
                            "maxJobs and --job-id are ignored"))
parser.add_argument("-m", "--merge", dest="fireMerge", default=False,
                    action="store_true",
                    help = ("submit all setup Pede jobs; "
                            "maxJobs is ignored, but --job-id is respected"))
parser.add_argument("-f", "--force-merge", dest="forceMerge", default=False,
                    action="store_true",
                    help=("force the submission of the Pede job in case some "+
                          "Mille jobs are not in the OK state"))
# the help text refers to the sibling option by its real spelling
# ('--force-merge'); there is no '--forceMerge' option
parser.add_argument("--force-merge-manual", dest="forceMergeManual", default=False,
                    action="store_true",
                    help=("force the submission of the Pede job in case some "+
                          "Mille jobs are not in the OK state. Unlike --force-merge "+
                          "this option assumes the user has edited theScript.sh and "+
                          "alignment_merge.py to consistently pick up only the mille "+
                          "output files that exist"))
parser.add_argument("-p", "--forward-proxy", dest="forwardProxy", default=False,
                    action="store_true",
                    help="forward VOMS proxy to batch system")
args = parser.parse_args(sys.argv[1:])
0208 
0209 
# load the local MPS job database
lib = mpslib.jobdatabase()
lib.read_db()

# '-a/--all': every Mille job is a candidate, so the user supplied 'maxJobs'
# and any explicit job selection are overridden
if args.allMille:
    args.maxJobs, args.job_id = lib.nJobs, None

# resubmission is only defined for explicitly selected jobs
if args.resub and args.job_id is None:
    print("Can only resubmit jobs if job IDs are specified")
    sys.exit(1)

# `job_mask` holds the job directory names that may be submitted
if args.job_id is not None:
    job_mask = []
    for requested in args.job_id:
        candidate = None  # stays None if the ID cannot be interpreted
        if requested.startswith("job"):
            candidate = requested
        elif requested.startswith("m"):
            candidate = "job"+requested
        else:
            # a plain number is a 1-based index into the job directories
            try:
                candidate = lib.JOBDIR[int(requested)-1]
            except ValueError:
                pass
            except IndexError:
                print("ID provided to '-j/--job-id' is out of range:", requested)
                sys.exit(1)

        if candidate is None or candidate not in lib.JOBDIR:
            print("ID provided to '-j/--job-id' is invalid:", requested)
            print("'-j/--job-id' requires the IDs to exist and to be of either", end=' ')
            print("of the following formats:")
            print(" - job042")
            print(" - 042")
            print(" - jobm1")
            print(" - m1")
            sys.exit(1)

        job_mask.append(candidate)
else:
    # no explicit selection: all job directories are eligible
    job_mask = lib.JOBDIR

# absolute path of the directory holding the individual job directories
theJobData = os.path.join(os.getcwd(), "jobData")

# batch job name: 'mpalign' unless the database provides a non-empty
# 'addFiles' entry
theJobName = lib.addFiles if lib.addFiles != '' else 'mpalign'

# switched on further below if the job class requests HTCondor
fire_htcondor = False
0259 
# Submit the 'normal' parallel jobs (Mille jobs) or, if '-m/--merge' was
# given, the merge (Pede) jobs, and finally store the updated job states.

# replies of the batch systems that signal a successful submission; raw
# strings keep the regex escapes intact
_HTCONDOR_SUBMITTED = r"1 job\(s\) submitted to cluster (\d+)\."
_LSF_SUBMITTED = r"Job <(\d+)> is submitted"


def _submitted_job_id(result, htcondor):
    """Extract the batch job ID from the output of a submission command.

    Arguments:
    - `result`: output (str) of 'condor_submit' or 'bsub'
    - `htcondor`: whether `result` stems from 'condor_submit'

    Returns the job ID as string (for HTCondor with '.0' appended to the
    cluster number) or None if `result` does not report a submission.
    """

    pattern = _HTCONDOR_SUBMITTED if htcondor else _LSF_SUBMITTED
    match = re.search(pattern, result)
    if match is None:
        return None
    job_id = match.group(1)
    # need standard format for job number
    return job_id+".0" if htcondor else job_id


def _remove_files(file_names):
    """Delete each file in `file_names`; failures are reported, not raised."""

    for file_name in file_names:
        try:
            os.remove(file_name)
        except OSError as e:
            print("Cannot delete file %s :" % file_name, e.strerror)


if not args.fireMerge:
    #set the resources string coming from mps.db
    resources = lib.get_class('mille')

    # "cmscafspec" found in $resources: special cmscaf resources
    if 'cmscafspec' in resources:
        print('\nWARNING:\n  Running mille jobs on cmscafspec, intended for pede only!\n\n')
        resources = '-q cmscafalcamille'
    # "cmscaf" found in $resources
    elif 'cmscaf' in resources:
        # g_cmscaf for ordinary caf queue, keeping 'cmscafspec' free for pede jobs:
        resources = '-q'+resources+' -m g_cmscaf'
    elif "htcondor" in resources:
        fire_htcondor = True
        # the selected scheduler has to match the requested pool
        schedinfo = subprocess.check_output(["myschedd","show"]).decode()
        if 'cafalca' in resources:
            if 'tzero' not in schedinfo:
                print("\nMPS fire: request to use CAF pool which has not been set up. Call `module load lxbatch/tzero` and try again")
                sys.exit(1)
        elif 'share' not in schedinfo:
            print("\nMPS fire: request to use standard pool when CAF pool is set up. Call `module load lxbatch/share` and try again")
            sys.exit(1)
    else:
        resources = '-q '+resources

    nSub = 0 # number of submitted Jobs
    for i in range(lib.nJobs):
        if lib.JOBDIR[i] not in job_mask: continue

        # jobs in SETUP state are submitted; any other state only on '--resubmit'
        is_setup = lib.JOBSTATUS[i] == 'SETUP'
        if not (is_setup or args.resub): continue
        if nSub >= args.maxJobs: break # nothing further would be submitted

        if args.forwardProxy:
            forward_proxy(os.path.join(theJobData,lib.JOBDIR[i]))

        if not is_setup:
            # resubmission: clear the output of the previous attempt
            job_dir = "%s/%s" % (theJobData, lib.JOBDIR[i])
            _remove_files([job_dir+"/HTCJOB",
                           job_dir+"/STDOUT",
                           job_dir+"/STDOUT.gz",
                           job_dir+"/alignment.log.gz",
                           job_dir+"/millePedeMonitor%03d.root" % lib.JOBNUMBER[i]])

        # submit a new job with 'bsub -J ...' and check output
        # for some reasons LSF wants script with full path
        if fire_htcondor:
            # `Path` deliberately stays a module-level name: helpers of this
            # script may look it up as a global
            Path = os.path.join(theJobData,lib.JOBDIR[i])
            scriptPath = os.path.join(Path, "theScript.sh")
            proxy_path = os.path.join(Path,".user_proxy") if args.forwardProxy else None
            job_submit_file = write_HTCondor_submit_file_mille(Path, scriptPath, lib, proxy_path)
            submission = "condor_submit -batch-name %s %s"%\
                  (theJobName, job_submit_file)
        else:
            submission = 'bsub -J %s %s %s/%s/theScript.sh' % \
                  (theJobName, resources, theJobData, lib.JOBDIR[i])
        print(submission)
        try:
            result = subprocess.check_output(submission,
                                             stderr=subprocess.STDOUT,
                                             shell=True).decode()
        except subprocess.CalledProcessError:
            result = "" # -> check for successful job submission will fail
        # first submissions and resubmissions have always echoed the reply
        # in slightly different layouts; both are kept
        if is_setup:
            print(result)
        else:
            print('      '+result, end=' ')
        result = result.strip()

        # check if job was submitted and updating jobdatabase
        job_id = _submitted_job_id(result, fire_htcondor)
        if job_id is not None:
            lib.JOBSTATUS[i] = 'SUBTD'
            lib.JOBID[i] = job_id
        else:
            print('Submission of %03d seems to have failed: %s' % (lib.JOBNUMBER[i],result), end=' ')
        nSub +=1


# fire the merge job
else:
    print('fire merge')
    # set the resources string coming from mps.db
    resources = lib.get_class('pede')
    if 'cmscafspec' in resources:
        resources = '-q cmscafalcamille'
    elif "htcondor" in resources:
        fire_htcondor = True
        schedinfo = subprocess.check_output(["myschedd","show"]).decode()
        if 'bigmem' in resources and 'share' not in schedinfo:
            print("\nMPS fire: CAF pool is set up, but request to use high-memory machines which live in the standard pool. Call `module load lxbatch/share` and try again")
            sys.exit(1)
    else:
        resources = '-q '+resources

    if not fire_htcondor:
        # Allocate memory for pede job FIXME check documentation for bsub!!!!!
        resources = resources+' -R "rusage[mem="%s"]"' % str(lib.pedeMem) # FIXME the dots? -> see .pl

    # check whether all other jobs are OK (disabled Mille jobs do not count)
    mergeOK = all(status == 'OK' or 'DISABLED' in status
                  for status in lib.JOBSTATUS[:lib.nJobs])

    # loop over merge jobs; they follow the Mille jobs in the job lists
    for i in range(lib.nJobs, len(lib.JOBDIR)):
        jobNumFrom1 = i+1
        if lib.JOBDIR[i] not in job_mask:
            continue

        # check if current job in SETUP mode or if forced
        if lib.JOBSTATUS[i] != 'SETUP' and not args.resub:
            print('Merge job %d status %s not submitted.' % \
                  (jobNumFrom1, lib.JOBSTATUS[i]))
            continue
        if not (mergeOK or args.forceMerge or args.forceMergeManual):
            print('Merge job',jobNumFrom1,'not submitted since Mille jobs error/unfinished (Use -m -f to force).')
            continue

        # some paths for clarity
        # (`Path` and `mergeCfg` deliberately stay module-level names: helpers
        # of this script may look them up as globals)
        Path = os.path.join(theJobData,lib.JOBDIR[i])
        backupScriptPath  = os.path.join(Path, "theScript.sh.bak")
        scriptPath        = os.path.join(Path, "theScript.sh")

        if args.resub:
            # resubmission: clear the output of the previous attempt
            for outfile in os.listdir(Path):
                if outfile.endswith(".root"):
                    os.remove("%s/%s" %(Path, outfile))
            _remove_files(["%s/HTCJOB" % (Path),
                           "%s/STDOUT" % (Path),
                           "%s/STDOUT.gz" % (Path),
                           "%s/alignment.log.gz" % (Path)])

        # force option invoked (not honoured for resubmissions):
        if args.forceMerge and not args.resub:

            # make a backup copy of the script first, if it doesn't already exist.
            if not os.path.isfile(backupScriptPath):
                os.system('cp -p '+scriptPath+' '+backupScriptPath)

            # get the name of merge cfg file -> either the.py or alignment_merge.py
            command  = 'cat '+backupScriptPath+' | grep CONFIG_FILE | head -1 | awk -F"/" \'{print $NF}\''
            mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
            mergeCfg = mergeCfg.strip()

            if fire_htcondor:
                job_submit_file = write_HTCondor_submit_file_pede(Path, scriptPath, mergeCfg, lib)

            # make a backup copy of the cfg
            backupCfgPath  = os.path.join(Path, mergeCfg+".bak")
            cfgPath        = os.path.join(Path, mergeCfg)
            if not os.path.isfile(backupCfgPath):
                os.system('cp -p '+cfgPath+' '+backupCfgPath)

            # retrieve weights configuration
            # NOTE(review): unpickles a file from the job directory; only
            # safe as long as that directory is trusted
            with open(os.path.join(Path, ".weights.pkl"), "rb") as f:
                weight_conf = cPickle.load(f)

            # blank weights
            mps_tools.run_checked(["mps_weight.pl", "-c"])

            # apply weights
            for name,weight in weight_conf:
                print(" ".join(["mps_weight.pl", "-N", name, weight]))
                mps_tools.run_checked(["mps_weight.pl", "-N", name, weight])

            # rewrite the mergeCfg using only 'OK' jobs (uses first mille-job as baseconfig)
            inCfgPath = theJobData+'/'+lib.JOBDIR[0]+'/the.py'
            command ='mps_merge.py -w -c '+inCfgPath+' '+Path+'/'+mergeCfg+' '+Path+' '+str(lib.nJobs)
            os.system(command)

            # rewrite theScript.sh using only 'OK' jobs
            command = 'mps_scriptm.pl -c '+lib.mergeScript+' '+scriptPath+' '+Path+' '+mergeCfg+' '+str(lib.nJobs)+' '+lib.mssDir+' '+lib.mssDirPool
            os.system(command)

        else:
            # restore the backup copy of the script
            if os.path.isfile(backupScriptPath):
                os.system('cp -pf '+backupScriptPath+' '+scriptPath)

            # get the name of merge cfg file
            command  = "cat "+scriptPath+r" | grep '^\s*CONFIG_FILE' | awk -F'=' '{print $2}'"
            mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
            command  = 'basename '+mergeCfg
            mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
            mergeCfg = mergeCfg.replace('\n','')

            if fire_htcondor:
                job_submit_file = write_HTCondor_submit_file_pede(Path, scriptPath, mergeCfg, lib)

            # restore the backup copy of the cfg
            backupCfgPath  = Path+'/%s.bak' % mergeCfg
            cfgPath        = Path+'/%s'     % mergeCfg
            if os.path.isfile(backupCfgPath):
                os.system('cp -pf '+backupCfgPath+' '+cfgPath)

        # end of if/else forceMerge

        # submit merge job
        nMerge = i-lib.nJobs  # 'index' of this merge job
        curJobName = 'm'+str(nMerge)+'_'+theJobName
        if args.forwardProxy: forward_proxy(Path)
        if fire_htcondor:
            submission = ["condor_submit",
                          "-batch-name", curJobName,
                          job_submit_file]
        else:
            # NOTE(review): `resources` is handed over as one single argument
            # although it holds several options - confirm that bsub copes
            submission = ["bsub", "-J", curJobName, resources, scriptPath]
        # up to five attempts; the output of the last one is evaluated
        for _ in range(5):
            try:
                result = subprocess.check_output(submission, stderr=subprocess.STDOUT).decode()
                break
            except subprocess.CalledProcessError as e:
                # decode as in the success case: the output is printed and
                # searched as text below
                result = e.output.decode()

        print('     '+result, end=' ')
        result = result.strip()

        # check if merge job was submitted and updating jobdatabase
        job_id = _submitted_job_id(result, fire_htcondor)
        if job_id is not None:
            lib.JOBSTATUS[i] = 'SUBTD'
            lib.JOBID[i] = job_id
            print("jobid is", lib.JOBID[i])
        else:
            print('Submission of merge job seems to have failed:',result, end=' ')

    # end of loop on merge jobs


# store the updated job states and batch IDs
lib.write_db()