Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-25 02:29:03

0001 #!/usr/bin/env python3
0002 #  Submit jobs that are setup in local mps database to batch system
0003 #
0004 #  The bsub syntax: bsub -J 'jobname' -q 'queue name' theProgram
0005 #  The jobname will be something like MP_2015.
0006 #  The queue name is derived from lib.classInfo.
0007 #  The program is theScript.sh located in each job-directory.
0008 #  There may be the other option -R (see man bsub for info).
0009 #
0010 #  Usage:
0011 #
0012 #  mps_fire.py [-a] [-m [-f]] [maxjobs]
0013 #  mps_fire.py -h
0014 
0015 from builtins import range
0016 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0017 import Alignment.MillePedeAlignmentAlgorithm.mpslib.tools as mps_tools
0018 import os
0019 import sys
0020 import glob
0021 import shutil
0022 if sys.version_info[0]>2:
0023   import _pickle as cPickle
0024 else:
0025   import cPickle
0026 import subprocess
0027 import re
0028 import argparse
0029 
def forward_proxy(rundir):
    """Copy the user's VOMS proxy into `rundir` so the batch system can see it.

    Arguments:
    - `rundir`: directory for storing the forwarded proxy
    """

    # refuse to continue without a valid proxy
    if not mps_tools.check_proxy():
        print("Please create proxy via 'voms-proxy-init -voms cms -rfc'.")
        sys.exit(1)

    # locate the current proxy file and place a copy next to the job files
    proxy_location = subprocess.check_output(
        ["voms-proxy-info", "--path"]).decode().strip()
    target = os.path.join(rundir, ".user_proxy")
    shutil.copyfile(proxy_location, target)
0043 
0044 
def write_HTCondor_submit_file_pede(path, script, config, lib):
    """Writes 'job.submit' file in `path` and returns its full path.

    Arguments:
    - `path`: job directory
    - `script`: script to be executed
    - `config`: cfg file
    - `lib`: MPS lib object
    """

    resources = lib.get_class("pede").split("_")[1:] # strip off 'htcondor'
    job_flavour = resources[-1]

    job_submit_template="""\
universe              = vanilla
executable            = {script:s}
output                = {jobm:s}/STDOUT
error                 = {jobm:s}/STDOUT
log                   = {jobm:s}/HTCJOB
notification          = Always
transfer_output_files = ""
request_memory        = {pedeMem:d}M

# adapted to space used on eos for binaries:
request_disk          = {disk:d}

# adapted to threads parameter in pede options and number of available cores
request_cpus          = {cpus:d}

+JobFlavour           = "{flavour:s}"
"""
    if "bigmem" in resources:
        job_submit_template += """\
+BigMemJob            = True
+AccountingGroup      = "group_u_CMS.e_cms_caf_bigmem"

# automatically remove the job if the submitter has no permissions to run a BigMemJob
periodic_remove       = !regexp("group_u_CMS.e_cms_caf_bigmem", AccountingGroup) && BigMemJob =?= True
"""
    job_submit_template += "\nqueue\n"

    print("Determine number of pede threads...")
    # FIX: use the `path`/`config` arguments instead of the globals
    # 'Path'/'mergeCfg' that the caller happened to set (the `config`
    # parameter was previously unused).
    cms_process = mps_tools.get_process_object(os.path.join(path, config))
    pede_options = cms_process.AlignmentProducer.algoConfig.pedeSteerer.options.value()
    n_threads = 1
    for option in pede_options:
        if "threads" in option:
            n_threads = option.replace("threads", "").strip()
            n_threads = max(map(lambda x: int(x), n_threads.split()))
            break
    if n_threads > 16: n_threads = 16 # HTCondor machines have (currently) 16
                                      # cores, i.e. we ensure here that the job
                                      # would fit core-wise on one machine

    print("Determine required disk space on remote host...")
    # determine usage by each file instead of whole directory as this is what
    # matters for the specified disk usage:
    spco = subprocess.check_output # to make code below more less verbose
    opj = os.path.join             # dito
    cmd = ["du", "--apparent-size"]
    disk_usage = [int(item.split()[0])
                  for directory in ("binaries", "monitors", "tree_files")
                  for item
                  in spco(cmd+
                          glob.glob(opj(lib.mssDir, directory, "*"))).decode().splitlines()]
    disk_usage = sum(disk_usage)
    disk_usage *= 1.1 # reserve 10% additional space

    # write the submit file into the job directory passed as argument
    job_submit_file = os.path.join(path, "job.submit")
    with open(job_submit_file, "w") as f:
        f.write(job_submit_template.format(script = os.path.abspath(script),
                                           jobm = os.path.abspath(path),
                                           pedeMem = lib.pedeMem,
                                           disk = int(disk_usage),
                                           cpus = n_threads,
                                           flavour = job_flavour))

    return job_submit_file
0123 
def write_HTCondor_submit_file_mille(path, script, lib, proxy_path=None):
    """Writes 'job.submit' file in `path` and returns its full path.

    Arguments:
    - `path`: job directory
    - `script`: script to be executed
    - `lib`: MPS lib object
    - `proxy_path`: path to proxy (only used in case of requested proxy forward)
    """

    resources = lib.get_class("mille").split("_")[1:] # strip off 'htcondor'
    job_flavour = resources[-1]

    job_submit_template="""\
universe              = vanilla
executable            = {script:s}
output                = {jobm:s}/STDOUT
error                 = {jobm:s}/STDOUT
log                   = {jobm:s}/HTCJOB
notification          = Always
transfer_output_files = ""

+JobFlavour           = "{flavour:s}"
"""
    if "cafalca" in resources:
        job_submit_template += """\
+CAFJob              = True
+AccountingGroup     = "group_u_CMS.CAF.ALCA"
# automatically remove the job if the submitter has no permissions to run a CAF Job
periodic_remove       = !regexp("group_u_CMS.CAF.ALCA", AccountingGroup) && CAFJob =?= True
"""

    if proxy_path is not None:
        job_submit_template += """\
+x509userproxy        = "{proxy:s}"
"""
    job_submit_template += "\nqueue\n"

    # FIX: write into the `path` argument instead of the global 'Path'
    # that the caller happened to set.
    job_submit_file = os.path.join(path, "job.submit")
    with open(job_submit_file, "w") as f:
        f.write(job_submit_template.format(script = os.path.abspath(script),
                                           jobm = os.path.abspath(path),
                                           flavour = job_flavour,
                                           proxy = proxy_path))

    return job_submit_file
0170 
0171 
0172 
# command-line interface ----------------------------------------------------
parser = argparse.ArgumentParser(
        description="Submit jobs that are setup in local mps database to batch system.",
)
# positional: number of Mille jobs to fire in this invocation
parser.add_argument("maxJobs", type=int, nargs='?', default=1,
                    help="number of Mille jobs to be submitted (default: %(default)d)")
parser.add_argument("-j", "--job-id", dest = "job_id", nargs = "*",
                    help = ("job IDs to be submitted; "
                            "use either 'job<ID>' or directly '<ID>'"))
parser.add_argument("-r", "--resubmit", dest = "resub", default=False,
                    action="store_true",
                    help = ("resubmit jobs - only works if job IDs are specified"))
parser.add_argument("-a", "--all", dest="allMille", default=False,
                    action="store_true",
                    help = ("submit all setup Mille jobs; "
                            "maxJobs and --job-id are ignored"))
parser.add_argument("-m", "--merge", dest="fireMerge", default=False,
                    action="store_true",
                    help = ("submit all setup Pede jobs; "
                            "maxJobs is ignored, but --job-id is respected"))
parser.add_argument("-f", "--force-merge", dest="forceMerge", default=False,
                    action="store_true",
                    help=("force the submission of the Pede job in case some "+
                          "Mille jobs are not in the OK state"))
parser.add_argument("--force-merge-manual", dest="forceMergeManual", default=False,
                    action="store_true",
                    help=("force the submission of the Pede job in case some "+
                          "Mille jobs are not in the OK state. Unlike --forceMerge "+
                          "this option assumes the user has edited theScript.sh and "+
                          "alignment_merge.py to consistently pick up only the mille "+
                          "output files that exist"))
parser.add_argument("-p", "--forward-proxy", dest="forwardProxy", default=False,
                    action="store_true",
                    help="forward VOMS proxy to batch system")
args = parser.parse_args(sys.argv[1:])
0207 
0208 
# read the local mps database (mps.db) into memory
lib = mpslib.jobdatabase()
lib.read_db()

if args.allMille:
    # submit all Mille jobs and ignore 'maxJobs' supplied by user
    args.maxJobs = lib.nJobs
    args.job_id = None

# resubmission is only meaningful for explicitly selected jobs
if args.job_id is None and args.resub:
    print("Can only resubmit jobs if job IDs are specified")
    sys.exit(1)


# translate the user-supplied IDs into a mask of job directory names;
# accepted formats: 'job042', '042', 'jobm1', 'm1'
if args.job_id is None:
    job_mask = lib.JOBDIR
else:
    job_mask = []
    for job_id in args.job_id:
        invalid_id = False
        if job_id.startswith("job"): job_mask.append(job_id)
        elif job_id.startswith("m"): job_mask.append("job"+job_id)
        else:
            # plain numeric ID: 1-based index into the job directory list
            try:
                job_mask.append(lib.JOBDIR[int(job_id)-1])
            except ValueError:
                invalid_id = True
            except IndexError:
                print("ID provided to '-j/--job-id' is out of range:", job_id)
                sys.exit(1)

        # `invalid_id` short-circuits so job_mask[-1] is never read for a
        # non-numeric, non-prefixed ID that was not appended
        if invalid_id or job_mask[-1] not in lib.JOBDIR:
            print("ID provided to '-j/--job-id' is invalid:", job_id)
            print("'-j/--job-id' requires the IDs to exist and to be of either", end=' ')
            print("of the following formats:")
            print(" - job042")
            print(" - 042")
            print(" - jobm1")
            print(" - m1")
            sys.exit(1)

# build the absolute job directory path (needed by mps_script)
theJobData = os.path.join(os.getcwd(), "jobData")

# set the job name: default 'mpalign' unless the database provides one
theJobName = 'mpalign'
if lib.addFiles != '':
    theJobName = lib.addFiles

# flipped to True below when the configured batch system is HTCondor
fire_htcondor = False
0258 
# fire the 'normal' parallel Jobs (Mille Jobs)
if not args.fireMerge:
    #set the resources string coming from mps.db
    resources = lib.get_class('mille')

    # "cmscafspec" found in $resources: special cmscaf resources
    if 'cmscafspec' in resources:
        print('\nWARNING:\n  Running mille jobs on cmscafspec, intended for pede only!\n\n')
        resources = '-q cmscafalcamille'
    # "cmscaf" found in $resources
    elif 'cmscaf' in resources:
        # g_cmscaf for ordinary caf queue, keeping 'cmscafspec' free for pede jobs:
        resources = '-q'+resources+' -m g_cmscaf'
    elif "htcondor" in resources:
        fire_htcondor = True
        # verify that the scheduler pool matches the requested resource class
        schedinfo = subprocess.check_output(["myschedd","show"]).decode()
        if 'cafalca' in resources:
            if not 'tzero' in schedinfo:
                print("\nMPS fire: request to use CAF pool which has not been set up. Call `module load lxbatch/tzero` and try again")
                exit(1)
        else:
            if not 'share' in schedinfo:
                print("\nMPS fire: request to use standard pool when CAF pool is set up. Call `module load lxbatch/share` and try again")
                exit(1)
    else:
        resources = '-q '+resources

    nSub = 0 # number of submitted Jobs
    for i in range(lib.nJobs):
        if lib.JOBDIR[i] not in job_mask: continue
        if lib.JOBSTATUS[i] == 'SETUP':
            if nSub < args.maxJobs:
                if args.forwardProxy:
                    forward_proxy(os.path.join(theJobData,lib.JOBDIR[i]))

                # submit a new job with 'bsub -J ...' and check output
                # for some reasons LSF wants script with full path
                if fire_htcondor:
                    Path = os.path.join(theJobData,lib.JOBDIR[i])
                    scriptPath = os.path.join(Path, "theScript.sh")
                    if args.forwardProxy:
                        job_submit_file = write_HTCondor_submit_file_mille(Path, scriptPath, lib,os.path.join(Path,".user_proxy"))
                    else:
                        job_submit_file = write_HTCondor_submit_file_mille(Path, scriptPath, lib)
                    submission = "condor_submit -batch-name %s %s"%\
                          (theJobName, job_submit_file)
                else:
                    submission = 'bsub -J %s %s %s/%s/theScript.sh' % \
                          (theJobName, resources, theJobData, lib.JOBDIR[i])
                print(submission)
                try:
                    result = subprocess.check_output(submission,
                                                     stderr=subprocess.STDOUT,
                                                     shell=True).decode()
                except subprocess.CalledProcessError as e:
                    result = "" # -> check for successful job submission will fail
                #print('      '+result, end=' ')
                print(result)
                result = result.strip()

                # check if job was submitted and updating jobdatabase
                if fire_htcondor:
                    match = re.search(r"1 job\(s\) submitted to cluster (\d+)\.", result)
                else:
                    # FIX: raw string ('\d' is an invalid escape in a plain literal)
                    match = re.search(r'Job <(\d+)> is submitted', result)
                if match:
                    # need standard format for job number
                    lib.JOBSTATUS[i] = 'SUBTD'
                    lib.JOBID[i] = match.group(1)
                    if fire_htcondor: lib.JOBID[i] += ".0"
                else:
                    print('Submission of %03d seems to have failed: %s' % (lib.JOBNUMBER[i],result), end=' ')
                nSub +=1

        elif args.resub:
            if nSub < args.maxJobs:
                if args.forwardProxy:
                    forward_proxy(os.path.join(theJobData,lib.JOBDIR[i]))

                # remove stale output of the previous attempt (best effort)
                try:
                    os.remove("%s/%s/HTCJOB" % (theJobData, lib.JOBDIR[i]))
                except OSError as e:
                    print("Cannot delete file %s/%s/HTCJOB :" % (theJobData,lib.JOBDIR[i]), e.strerror)
                try:
                    os.remove("%s/%s/STDOUT" % (theJobData, lib.JOBDIR[i]))
                except OSError as e:
                    print("Cannot delete file %s/%s/STDOUT :" % (theJobData,lib.JOBDIR[i]), e.strerror)
                try:
                    os.remove("%s/%s/STDOUT.gz" % (theJobData, lib.JOBDIR[i]))
                except OSError as e:
                    print("Cannot delete file %s/%s/STDOUT.gz :" % (theJobData,lib.JOBDIR[i]), e.strerror)
                try:
                    os.remove("%s/%s/alignment.log.gz" % (theJobData, lib.JOBDIR[i]))
                except OSError as e:
                    print("Cannot delete file %s/%s/alignment.log.gz :" % (theJobData,lib.JOBDIR[i]), e.strerror)
                try:
                    os.remove("%s/%s/millePedeMonitor%03d.root" % (theJobData, lib.JOBDIR[i], lib.JOBNUMBER[i]))
                except OSError as e:
                    print("Cannot delete file %s/%s/millePedeMonitor%03d.root :" % (theJobData,lib.JOBDIR[i],lib.JOBNUMBER[i]), e.strerror)

                # submit a new job with 'bsub -J ...' and check output
                # for some reasons LSF wants script with full path
                if fire_htcondor:
                    Path = os.path.join(theJobData,lib.JOBDIR[i])
                    scriptPath = os.path.join(Path, "theScript.sh")
                    if args.forwardProxy:
                        job_submit_file = write_HTCondor_submit_file_mille(Path, scriptPath, lib,os.path.join(Path,".user_proxy"))
                    else:
                        job_submit_file = write_HTCondor_submit_file_mille(Path, scriptPath, lib)
                    submission = "condor_submit -batch-name %s %s"%\
                          (theJobName, job_submit_file)
                else:
                    submission = 'bsub -J %s %s %s/%s/theScript.sh' % \
                          (theJobName, resources, theJobData, lib.JOBDIR[i])
                print(submission)
                try:
                    result = subprocess.check_output(submission,
                                                     stderr=subprocess.STDOUT,
                                                     shell=True).decode()
                except subprocess.CalledProcessError as e:
                    result = "" # -> check for successful job submission will fail
                print('      '+result, end=' ')
                result = result.strip()

                # check if job was submitted and updating jobdatabase
                if fire_htcondor:
                    match = re.search(r"1 job\(s\) submitted to cluster (\d+)\.", result)
                else:
                    # FIX: raw string ('\d' is an invalid escape in a plain literal)
                    match = re.search(r'Job <(\d+)> is submitted', result)
                if match:
                    # need standard format for job number
                    lib.JOBSTATUS[i] = 'SUBTD'
                    lib.JOBID[i] = match.group(1)
                    if fire_htcondor: lib.JOBID[i] += ".0"
                else:
                    print('Submission of %03d seems to have failed: %s' % (lib.JOBNUMBER[i],result), end=' ')
                nSub +=1
0397 
0398 # fire the merge job
0399 else:
0400     print('fire merge')
0401     # set the resources string coming from mps.db
0402     resources = lib.get_class('pede')
0403     if 'cmscafspec' in resources:
0404         resources = '-q cmscafalcamille'
0405     elif "htcondor" in resources:
0406         fire_htcondor = True
0407         schedinfo = subprocess.check_output(["myschedd","show"]).decode()
0408         if 'bigmem' in resources:
0409             if not 'share' in schedinfo:
0410                 print("\nMPS fire: CAF pool is set up, but request to use high-memory machines which live in the standard pool. Call `module load lxbatch/share` and try again")
0411                 exit(1)
0412     else:
0413         resources = '-q '+resources
0414 
0415     if not fire_htcondor:
0416         # Allocate memory for pede job FIXME check documentation for bsub!!!!!
0417         resources = resources+' -R \"rusage[mem="%s"]\"' % str(lib.pedeMem) # FIXME the dots? -> see .pl
0418 
0419     # check whether all other jobs are OK
0420     mergeOK = True
0421     for i in range(lib.nJobs):
0422         if lib.JOBSTATUS[i] != 'OK':
0423             if 'DISABLED' not in lib.JOBSTATUS[i]:
0424                 mergeOK = False
0425                 break
0426 
0427     # loop over merge jobs
0428     i = lib.nJobs
0429     while i<len(lib.JOBDIR):
0430         jobNumFrom1 = i+1
0431         if lib.JOBDIR[i] not in job_mask:
0432             i += 1
0433             continue
0434 
0435         # check if current job in SETUP mode or if forced
0436         if lib.JOBSTATUS[i] != 'SETUP' and not args.resub:
0437             print('Merge job %d status %s not submitted.' % \
0438                   (jobNumFrom1, lib.JOBSTATUS[i]))
0439         elif not (mergeOK or args.forceMerge or args.forceMergeManual):
0440             print('Merge job',jobNumFrom1,'not submitted since Mille jobs error/unfinished (Use -m -f to force).')
0441         elif not args.resub:
0442             # some paths for clarity
0443             Path = os.path.join(theJobData,lib.JOBDIR[i])
0444             backupScriptPath  = os.path.join(Path, "theScript.sh.bak")
0445             scriptPath        = os.path.join(Path, "theScript.sh")
0446 
0447             # force option invoked:
0448             if args.forceMerge:
0449 
0450                 # make a backup copy of the script first, if it doesn't already exist.
0451                 if not os.path.isfile(backupScriptPath):
0452                     os.system('cp -p '+scriptPath+' '+backupScriptPath)
0453 
0454                 # get the name of merge cfg file -> either the.py or alignment_merge.py
0455                 command  = 'cat '+backupScriptPath+' | grep CONFIG_FILE | head -1 | awk -F"/" \'{print $NF}\''
0456                 mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
0457                 mergeCfg = mergeCfg.strip()
0458 
0459                 if fire_htcondor:
0460                     job_submit_file = write_HTCondor_submit_file_pede(Path, scriptPath, mergeCfg, lib)
0461 
0462                 # make a backup copy of the cfg
0463                 backupCfgPath  = os.path.join(Path, mergeCfg+".bak")
0464                 cfgPath        = os.path.join(Path, mergeCfg)
0465                 if not os.path.isfile(backupCfgPath):
0466                     os.system('cp -p '+cfgPath+' '+backupCfgPath)
0467 
0468                 # retrieve weights configuration
0469                 with open(os.path.join(Path, ".weights.pkl"), "rb") as f:
0470                     weight_conf = cPickle.load(f)
0471 
0472                 # blank weights
0473                 mps_tools.run_checked(["mps_weight.pl", "-c"])
0474 
0475                 # apply weights
0476                 for name,weight in weight_conf:
0477                     print(" ".join(["mps_weight.pl", "-N", name, weight]))
0478                     mps_tools.run_checked(["mps_weight.pl", "-N", name, weight])
0479 
0480                 # rewrite the mergeCfg using only 'OK' jobs (uses first mille-job as baseconfig)
0481                 inCfgPath = theJobData+'/'+lib.JOBDIR[0]+'/the.py'
0482                 command ='mps_merge.py -w -c '+inCfgPath+' '+Path+'/'+mergeCfg+' '+Path+' '+str(lib.nJobs)
0483                 os.system(command)
0484 
0485                 # rewrite theScript.sh using inly 'OK' jobs
0486                 command = 'mps_scriptm.pl -c '+lib.mergeScript+' '+scriptPath+' '+Path+' '+mergeCfg+' '+str(lib.nJobs)+' '+lib.mssDir+' '+lib.mssDirPool
0487                 os.system(command)
0488 
0489             else:
0490                 # restore the backup copy of the script
0491                 if os.path.isfile(backupScriptPath):
0492                     os.system('cp -pf '+backupScriptPath+' '+scriptPath)
0493 
0494                 # get the name of merge cfg file
0495                 command  = "cat "+scriptPath+" | grep '^\s*CONFIG_FILE' | awk -F'=' '{print $2}'"
0496                 mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
0497                 command  = 'basename '+mergeCfg
0498                 mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
0499                 mergeCfg = mergeCfg.replace('\n','')
0500 
0501                 if fire_htcondor:
0502                     job_submit_file = write_HTCondor_submit_file_pede(Path, scriptPath, mergeCfg, lib)
0503 
0504                 # restore the backup copy of the cfg
0505                 backupCfgPath  = Path+'/%s.bak' % mergeCfg
0506                 cfgPath        = Path+'/%s'     % mergeCfg
0507                 if os.path.isfile(backupCfgPath):
0508                     os.system('cp -pf '+backupCfgPath+' '+cfgPath)
0509 
0510             # end of if/else forceMerge
0511 
0512             # submit merge job
0513             nMerge = i-lib.nJobs  # 'index' of this merge job
0514             curJobName = 'm'+str(nMerge)+'_'+theJobName
0515             if args.forwardProxy: forward_proxy(Path)
0516             if fire_htcondor:
0517                 submission = ["condor_submit",
0518                               "-batch-name", curJobName,
0519                               job_submit_file]
0520             else:
0521                 submission = ["bsub", "-J", curJobName, resources, scriptPath]
0522             for _ in range(5):
0523                 try:
0524                     result = subprocess.check_output(submission, stderr=subprocess.STDOUT).decode()
0525                     break
0526                 except subprocess.CalledProcessError as e:
0527                     result = e.output
0528 
0529             print('     '+result, end=' ')
0530             result = result.strip()
0531 
0532             # check if merge job was submitted and updating jobdatabase
0533             if fire_htcondor:
0534                 match = re.search(r"1 job\(s\) submitted to cluster (\d+)\.", result)
0535             else:
0536                 match = re.search('Job <(\d+)> is submitted', result)
0537             if match:
0538                 lib.JOBSTATUS[i] = 'SUBTD'
0539                 lib.JOBID[i] = match.group(1)
0540                 # need standard format for job number
0541                 if fire_htcondor: lib.JOBID[i] += ".0"
0542                 print("jobid is", lib.JOBID[i])
0543             else:
0544                 print('Submission of merge job seems to have failed:',result, end=' ')
0545 
0546         elif args.resub:
0547             # some paths for clarity
0548             Path = os.path.join(theJobData,lib.JOBDIR[i])
0549             dircontents = os.listdir(Path)
0550             for outfile in dircontents:
0551                 if outfile.endswith(".root"):
0552                     os.remove("%s/%s" %(Path, outfile))
0553             try:
0554                 os.remove("%s/HTCJOB" % (Path)) 
0555             except OSError as e:
0556                 print("Cannot delete file %s/HTCJOB :" % (Path), e.strerror)
0557             try:
0558                 os.remove("%s/STDOUT" % (Path)) 
0559             except OSError as e:
0560                 print("Cannot delete file %s/STDOUT :" % (Path), e.strerror)
0561             try:
0562                 os.remove("%s/STDOUT.gz" % (Path)) 
0563             except OSError as e:
0564                 print("Cannot delete file %s/STDOUT.gz :" % (Path), e.strerror)
0565             try:
0566                 os.remove("%s/alignment.log.gz" % (Path)) 
0567             except OSError as e:
0568                 print("Cannot delete file %s/alignment.log.gz :" % (Path), e.strerror)
0569 
0570 
0571             backupScriptPath  = os.path.join(Path, "theScript.sh.bak")
0572             scriptPath        = os.path.join(Path, "theScript.sh")
0573 
0574             # restore the backup copy of the script
0575             if os.path.isfile(backupScriptPath):
0576                 os.system('cp -pf '+backupScriptPath+' '+scriptPath)
0577 
0578             # get the name of merge cfg file
0579             command  = "cat "+scriptPath+" | grep '^\s*CONFIG_FILE' | awk -F'=' '{print $2}'"
0580             mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
0581             command  = 'basename '+mergeCfg
0582             mergeCfg = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True).decode()
0583             mergeCfg = mergeCfg.replace('\n','')
0584 
0585             if fire_htcondor:
0586                 job_submit_file = write_HTCondor_submit_file_pede(Path, scriptPath, mergeCfg, lib)
0587 
0588             # restore the backup copy of the cfg
0589             backupCfgPath  = Path+'/%s.bak' % mergeCfg
0590             cfgPath        = Path+'/%s'     % mergeCfg
0591             if os.path.isfile(backupCfgPath):
0592                os.system('cp -pf '+backupCfgPath+' '+cfgPath)
0593 
0594 
0595             # submit merge job
0596             nMerge = i-lib.nJobs  # 'index' of this merge job
0597             curJobName = 'm'+str(nMerge)+'_'+theJobName
0598             if args.forwardProxy: forward_proxy(Path)
0599             if fire_htcondor:
0600                 submission = ["condor_submit",
0601                               "-batch-name", curJobName,
0602                               job_submit_file]
0603             else:
0604                 submission = ["bsub", "-J", curJobName, resources, scriptPath]
0605             for _ in range(5):
0606                 try:
0607                     result = subprocess.check_output(submission, stderr=subprocess.STDOUT).decode()
0608                     break
0609                 except subprocess.CalledProcessError as e:
0610                     result = e.output
0611 
0612             print('     '+result, end=' ')
0613             result = result.strip()
0614 
0615             # check if merge job was submitted and updating jobdatabase
0616             if fire_htcondor:
0617                 match = re.search(r"1 job\(s\) submitted to cluster (\d+)\.", result)
0618             else:
0619                 match = re.search('Job <(\d+)> is submitted', result)
0620             if match:
0621                 lib.JOBSTATUS[i] = 'SUBTD'
0622                 lib.JOBID[i] = match.group(1)
0623                 # need standard format for job number
0624                 if fire_htcondor: lib.JOBID[i] += ".0"
0625                 print("jobid is", lib.JOBID[i])
0626             else:
0627                 print('Submission of merge job seems to have failed:',result, end=' ')
0628 
0629         i +=1
0630 
0631         # end of while on merge jobs
0632 
0633 
0634 lib.write_db()