Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-05-18 03:26:52

0001 #!/usr/bin/env python3
0002 #
0003 #  This script checks outputs from jobs that have FETCH status and updates if errors occured
0004 #  -> check STDOUT files
0005 #  -> check cmsRun.out
0006 #  -> check alignment.log
0007 #  -> check if millebinaries are on eos
0008 #  -> check pede.dump
0009 #  -> check millepede.log
0010 #  -> check millepede.end
0011 #
0012 #  It also retirieves number of events from alignment.log and cputime from STDOUT
0013 
0014 from __future__ import print_function
0015 from builtins import range
0016 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0017 import subprocess
0018 import re
0019 import os
0020 
0021 lib = mpslib.jobdatabase()
0022 lib.read_db()
0023 
0024 # create a list of eos ls entries containing files on eos binary store
0025 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
0026 try:
0027     eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).decode().split('\n')
0028 except subprocess.CalledProcessError:
0029     eoslsoutput = ""
0030 
0031 # loop over FETCH jobs
0032 for i in range(len(lib.JOBID)):
0033     # FIXME use bools?
0034     batchSuccess = 0
0035     batchExited = 0
0036     finished = 0
0037     endofjob = 0
0038     eofile = 1  # do not deal with timel yet
0039     timel = 0
0040     killed = 0
0041     ioprob = 0
0042     fw8001 = 0
0043     tooManyTracks = 0
0044     segviol = 0
0045     rfioerr = 0
0046     quota = 0
0047     nEvent = 0
0048     cputime = -1
0049     pedeAbend = 0
0050     pedeLogErr = 0
0051     pedeLogWrn = 0
0052     exceptionCaught = 0
0053     timeout = 0
0054     cfgerr = 0
0055     emptyDatErr = 0
0056     emptyDatOnFarm = 0
0057     cmdNotFound = 0
0058     insuffPriv = 0
0059     quotaspace = 0
0060     copyerr=0
0061     ispede=0
0062 
0063     kill_reason = None
0064     pedeLogErrStr = ""
0065     pedeLogWrnStr = ""
0066     remark = ""
0067 
0068     disabled = "";
0069     if 'DISABLED' in lib.JOBSTATUS[i]:
0070         disabled = 'DISABLED'
0071 
0072     if 'FETCH' in lib.JOBSTATUS[i]:
0073 
0074         # open the STDOUT file
0075         stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
0076         # unzip the STDOUT file if necessary
0077         if os.access(stdOut+'.gz', os.R_OK):
0078             os.system('gunzip '+stdOut+'.gz')
0079 
0080         try:
0081             with open(stdOut, "r") as STDFILE:
0082                 # scan records in input file.
0083                 # use regular expression to search. re.compile needed for options re.M and re.I
0084                 # re.M=re.MULTILINE enables matching of newline char
0085                 # re.I=re.IGNORECASE makes matching case-insensitive.
0086                 for line in STDFILE:
0087                     if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
0088                         quotaspace = 1
0089                     if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
0090                         quotaspace = 1
0091                     if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
0092                         quotaspace = 1
0093                     if re.search(re.compile('CERN report: Job Killed',re.M), line):
0094                         killed = 1
0095                     if re.search(re.compile('Job finished',re.M), line):
0096                         finished = 1
0097                     if re.search(re.compile('connection timed out',re.M), line):
0098                         timeout = 1
0099                     if re.search(re.compile('ConfigFileReadError',re.M), line):
0100                         cfgerr = 1
0101                     if re.search(re.compile('0 bytes transferred',re.M), line):
0102                         emptyDatOnFarm = 1
0103                     if re.search(re.compile('command not found',re.M), line):
0104                         cmdNotFound = 1
0105                     # AP 26.11.2009 Insufficient privileges to rfcp files
0106                     if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
0107                         insuffPriv = 1
0108                     if re.search(re.compile('Give up doing',re.M), line):
0109                         copyerr = 1
0110                     if re.search(re.compile('Directory content before',re.M),line):
0111                         ispede = 1
0112                     # AP 05.11.2015 Extract cpu-time.
0113                     # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
0114                     # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
0115                     # CPU time from email.
0116                     match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
0117                     if match:
0118                         cpuFactor = 2.125
0119                         cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
0120 
0121             # gzip it afterwards:
0122             print('gzip -f '+stdOut)
0123             os.system('gzip -f '+stdOut)
0124         except IOError as e:
0125             if e.args == (2, "No such file or directory"):
0126                 print("mps_check.py cannot find", stdOut, "to test")
0127             else:
0128                 raise
0129 
0130         # check HTCondor log file
0131         try:
0132             log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
0133             condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
0134                                                   "-userlog", log_file,
0135                                                   "-af",
0136                                                   "RemoteSysCpu",
0137                                                   "JobStatus",
0138                                                   "RemoveReason"],
0139                                                  stderr = subprocess.STDOUT).decode()
0140             condor_log = condor_log.split()
0141 
0142             cputime = int(round(float(condor_log[0])))
0143 
0144             if condor_log[1] == "3": # JobStatus == Removed
0145                 killed = 1
0146                 kill_reason = " ".join(condor_log[2:])
0147 
0148         except subprocess.CalledProcessError as e:
0149             pass
0150 
0151 
0152         # GF: This file is not produced (anymore...) -> check for existence and read-access added
0153         eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
0154         if os.access(eazeLog, os.R_OK):
0155             # open the input file
0156             with open(eazeLog, "r") as INFILE:
0157                 # scan records in input file
0158                 for line in INFILE:
0159                     # check if end of file has been reached
0160                     if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0161                         eofile = 1
0162                     if re.search(re.compile('Time limit reached\.',re.M), line):
0163                         timel = 1
0164                     if re.search(re.compile('gives I\/O problem',re.M), line):
0165                         ioprob = 1
0166                     if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0167                         fw8001 = 1
0168                     if re.search(re.compile('too many tracks',re.M), line):
0169                         tooManyTracks = 1
0170                     if re.search(re.compile('segmentation violation',re.M), line):
0171                         segviol = 1
0172                     if re.search(re.compile('failed RFIO error',re.M), line):
0173                         rfioerr = 1
0174                     if re.search(re.compile('Request exceeds quota',re.M), line):
0175                         quota = 1
0176 
0177         # if there is an alignment.log[.gz] file, check it as well
0178         eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
0179         logZipped = 'no'
0180         # unzip the logfile if necessary
0181         if os.access(eazeLog+'.gz', os.R_OK):
0182             os.system('gunzip '+eazeLog+'.gz')
0183             logZipped = 'true'
0184 
0185         if os.access(eazeLog, os.R_OK):   # access to alignment.log
0186             # open the input file
0187             with open(eazeLog,'r') as INFILE:
0188                 # scan records in input file
0189                 for line in INFILE:
0190                     # check if end of file has been reached
0191                     if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0192                         eofile = 1
0193                     if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
0194                         timel = 1
0195                     if re.search(re.compile('GAF gives I\/O problem',re.M), line):
0196                         ioprob = 1
0197                     if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0198                         fw8001 = 1
0199                     if re.search(re.compile('too many tracks',re.M), line):
0200                         tooManyTracks = 1
0201                     if re.search(re.compile('segmentation violation',re.M), line):
0202                         segviol = 1
0203                     if re.search(re.compile('failed RFIO error',re.M), line):
0204                         rfioerr = 1
0205                     if re.search(re.compile('Request exceeds quota',re.M), line):
0206                         quota = 1
0207                     # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
0208                     if re.search(re.compile('Fatal Exception',re.M), line):
0209                         exceptionCaught = 1
0210                     if re.search(re.compile('Exception caught in cmsRun',re.M), line):
0211                         exceptionCaught = 1
0212                     # AP 07.09.2009 - Check that the job got to a normal end
0213                     if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
0214                         endofjob = 1
0215                     if re.search(re.compile('FwkReport            -i main_input:sourc',re.M), line):
0216                         array = line.split()
0217                         nEvent = int(array[5])
0218                     if nEvent==0 and re.search(re.compile('FwkReport            -i PostSource',re.M), line):
0219                         array = line.split()
0220                         nEvent = int(array[5])
0221                     # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
0222                     if nEvent==0 and re.search(re.compile('FwkReport            -i AfterSource',re.M), line):
0223                         array = line.split()
0224                         nEvent = int(array[5])
0225 
0226             if logZipped == 'true':
0227                 os.system('gzip -f '+eazeLog)
0228 
0229         else:   # no access to alignment.log
0230             print('mps_check.py cannot find',eazeLog,'to test')
0231             # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
0232             endofjob = 1
0233 
0234         # for mille jobs checks that milleBinary file is not empty
0235         if i<lib.nJobs:  # mille job!
0236             milleOut = 'milleBinary%03d.dat' % (i+1)
0237             # from Perl, should be deprecated because of cmsls and nsls
0238             #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
0239             #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
0240             #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
0241             mOutSize = 0
0242             #print(">>>eoslsoutput:", eoslsoutput, " \ttype(eoslsoutput):", type(eoslsoutput))
0243             for line in eoslsoutput:
0244                 if milleOut in line:
0245                     columns = line.split()
0246                     mOutSize = int(columns[4]) # 5th column = size
0247             if not (mOutSize>0):
0248                 emptyDatErr = 1
0249 
0250         # merge jobs: additional checks for merging job
0251         else:
0252             # if there is a pede.dump file check it as well
0253             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
0254             if os.access(eazeLog+'.gz', os.R_OK):
0255                 # unzip - but clean before and save to tmp
0256                 os.system('rm -f /tmp/pede.dump')
0257                 os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
0258                 eazeLog = '/tmp/pede.dump'
0259             if os.access(eazeLog, os.R_OK):
0260                 with open(eazeLog, "r") as INFILE: # open pede.dump
0261                     # scan records in INFILE
0262                     pedeAbend = 1
0263                     usedPedeMem = 0.
0264                     for line in INFILE:
0265                         # check if pede has reached its normal end
0266                         if re.search(re.compile('Millepede II.* ending',re.M), line):
0267                             pedeAbend = 0
0268                         # extract memory usage
0269                         match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
0270                         if match:
0271                             mem = match.group(1)
0272                             mem = re.sub('\s', '', mem)
0273                             # if mem is a float
0274                             if re.search(re.compile('^\d+\.\d+$',re.M), mem):
0275                                 usedPedeMem = float(mem)
0276                             else:
0277                                 print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
0278 
0279                 # check memory usage
0280                 # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
0281                 if lib.pedeMem > 0 and usedPedeMem > 0.:
0282                     memoryratio = usedPedeMem /(lib.pedeMem/1024.)
0283                     # print a warning if more than approx. 4 GB have been
0284                     # requested of which less than 75% are used by Pede
0285                     if lib.pedeMem > 4000 and memoryratio < 0.75 :
0286                         msg = ("Warning: {0:.2f} GB of memory for Pede "
0287                                "requested, but only {1:.1f}% of it has been "
0288                                "used! Consider to request less memory in order "
0289                                "to save resources.")
0290                         print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0291                     elif memoryratio > 1 :
0292                         msg = ("Warning: {0:.2f} GB of memory for Pede "
0293                                "requested, but {1:.1f}% of this has been "
0294                                "used! Consider to request more memory to avoid "
0295                                "premature removal of the job by the admin.")
0296                         print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0297                     else:
0298                         msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
0299                                "which has been requested for Pede.")
0300                         print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
0301 
0302 
0303                 # clean up /tmp/pede.dump if needed
0304                 if eazeLog == '/tmp/pede.dump':
0305                     os.system('rm /tmp/pede.dump')
0306 
0307             # pede.dump not found or no read-access
0308             else:
0309                 print('mps_check.py cannot find',eazeLog,'to test')
0310 
0311             # if there is a millepede.log file, check it as well
0312             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
0313             logZipped = 'no'
0314             if os.access(eazeLog+'.gz', os.R_OK):
0315                 os.system('gunzip '+eazeLog+'.gz')
0316                 logZipped = 'true'
0317 
0318             if os.access(eazeLog, os.R_OK):
0319                 # open log file
0320                 with open(eazeLog, "r") as INFILE:
0321                     # scan records in input file
0322                     for line in INFILE:
0323                         # Checks for Pede Errors
0324                         if re.search(re.compile('step no descending',re.M), line):
0325                             pedeLogErr = 1
0326                             pedeLogErrStr += line
0327                         if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
0328                             pedeLogErr = 1
0329                             pedeLogErrStr += line
0330                         # AP 07.09.2009 - Checks for Pede Warnings:
0331                         if re.search(re.compile('insufficient constraint equations',re.M), line):
0332                             pedeLogWrn = 1
0333                             pedeLogWrnStr += line
0334 
0335                 if logZipped == 'true':
0336                     os.system('gzip -f '+eazeLog)
0337             else:
0338                 print('mps_check.py cannot find',eazeLog,'to test')
0339 
0340 
0341             # check millepede.end -- added F. Meier 03.03.2015
0342             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
0343             logZipped = 'no'
0344             if os.access(eazeLog+'.gz', os.R_OK):
0345                 os.system('gunzip'+eazeLog+'.gz')
0346                 logZipped = 'true'
0347 
0348             if os.access(eazeLog, os.R_OK):
0349                 # open log file
0350                 with open(eazeLog, "r") as INFILE:
0351                     # scan records in input file
0352                     for line in INFILE:
0353                         # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
0354                         # searches the line for a number with or without a sign
0355                         match = re.search(re.compile('([-+]?\d+)',re.M), line)
0356                         if match:
0357                             if int(match.group(1)) == 1:
0358                                 pedeLogWrn = 1
0359                                 pedeLogWrnStr += line
0360                             elif int(match.group(1)) != 0:
0361                                 pedeLogErr = 1
0362                                 pedeLogErrStr += line
0363                 if logZipped == 'true':
0364                     os.system('gzip -f '+eazeLog)
0365             else:
0366                 print('mps_check.py cannot find',eazeLog,'to test')
0367 
0368         # end of merge job checks
0369         # evaluate Errors:
0370         farmhost = ' '
0371 
0372         okStatus = 'OK'
0373         if not eofile == 1:
0374             print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
0375             okStatus = 'ABEND'
0376         if quotaspace == 1:
0377             print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
0378             okStatus = 'FAIL'
0379             remark = 'eos quota space problem'
0380         if ioprob == 1:
0381             print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
0382             okStatus = 'FAIL'
0383         if fw8001 == 1:
0384             print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
0385             remark = 'fwk error 8001'
0386             okStatus = 'FAIL'
0387         if timeout == 1:
0388             print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
0389             remark = 'connection timed out'
0390         if cfgerr == 1:
0391             print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
0392             remark = 'cfg file error'
0393             okStatus = 'FAIL'
0394         if killed == 1:
0395             guess = " (probably time exceeded)" if kill_reason is None else ":"
0396             print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
0397             if kill_reason is not None: print(kill_reason)
0398             remark = "killed";
0399             okStatus = "FAIL"
0400         if timel == 1:
0401             print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
0402             okStatus = 'TIMEL'
0403         if tooManyTracks == 1:
0404             print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
0405         if segviol == 1:
0406             print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
0407             remark = 'seg viol'
0408             okStatus = 'FAIL'
0409         if rfioerr == 1:
0410             print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
0411             remark = 'rfio error'
0412             okStatus = 'FAIL'
0413         if quota == 1:
0414             print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
0415         if exceptionCaught == 1:
0416             print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
0417             remark = 'Exception caught'
0418             okStatus = 'FAIL'
0419         if emptyDatErr == 1:
0420             print('milleBinary???.dat file not found or empty')
0421             remark = 'empty milleBinary'
0422             if emptyDatOnFarm > 0:
0423                 print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
0424             else:
0425                 okStatus = 'FAIL'
0426         if cmdNotFound == 1:
0427             print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
0428             remark = 'cmd not found'
0429             okStatus = 'FAIL'
0430         if insuffPriv == 1:
0431             print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
0432             remark = 'Could not rfcp files'
0433             okStatus = 'FAIL'
0434         if pedeAbend == 1:
0435             print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
0436             remark = 'pede failed'
0437             okStatus = 'FAIL'
0438         if pedeLogErr == 1:
0439             print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
0440             print(pedeLogErrStr)
0441             remark = 'pede error'
0442             okStatus = 'FAIL'
0443         if pedeLogWrn == 1:
0444             # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
0445             print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
0446             print(pedeLogWrnStr)
0447             remark = 'pede warnings'
0448             okStatus = 'WARN'
0449         if endofjob != 1:
0450             print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
0451             remark = 'job not ended'
0452             okStatus = 'FAIL'
0453         if copyerr == 1 and ispede!=1:
0454             #Copy errors in pede job can occur when a nonexistent file is commented in alignment_merge.py but not in theScript.sh, and in that case is *not* a failure
0455             print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
0456             remark = 'copy to eos failed'
0457             okStatus = 'FAIL'
0458 
0459 
0460         # print warning line to stdout
0461         if okStatus != "OK":
0462             print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
0463 
0464         # update number of events
0465         lib.JOBNEVT[i] = nEvent
0466         # udate Jobstatus
0467         lib.JOBSTATUS[i] = disabled+okStatus
0468         # update cputime
0469         lib.JOBRUNTIME[i] = cputime
0470         # update remark
0471         lib.JOBREMARK[i] = remark
0472         # update host
0473         ##lib.JOBHOST[i] = farmhost
0474 
0475 lib.write_db()
0476