Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-01-14 23:16:26

0001 #!/usr/bin/env python3
0002 #
0003 #  This script checks outputs from jobs that have FETCH status and updates if errors occured
0004 #  -> check STDOUT files
0005 #  -> check cmsRun.out
0006 #  -> check alignment.log
0007 #  -> check if millebinaries are on eos
0008 #  -> check pede.dump
0009 #  -> check millepede.log
0010 #  -> check millepede.end
0011 #
0012 #  It also retirieves number of events from alignment.log and cputime from STDOUT
0013 
0014 from builtins import range
0015 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0016 import subprocess
0017 import re
0018 import os
0019 
0020 lib = mpslib.jobdatabase()
0021 lib.read_db()
0022 
0023 # create a list of eos ls entries containing files on eos binary store
0024 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
0025 try:
0026     eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).decode().split('\n')
0027 except subprocess.CalledProcessError:
0028     eoslsoutput = ""
0029 
0030 # loop over FETCH jobs
0031 for i in range(len(lib.JOBID)):
0032     # FIXME use bools?
0033     batchSuccess = 0
0034     batchExited = 0
0035     finished = 0
0036     endofjob = 0
0037     badInputFile = 0
0038     eofile = 1  # do not deal with timel yet
0039     timel = 0
0040     killed = 0
0041     ioprob = 0
0042     fw8001 = 0
0043     tooManyTracks = 0
0044     segviol = 0
0045     rfioerr = 0
0046     quota = 0
0047     nEvent = 0
0048     cputime = -1
0049     pedeAbend = 0
0050     pedeLogErr = 0
0051     pedeLogWrn = 0
0052     exceptionCaught = 0
0053     timeout = 0
0054     cfgerr = 0
0055     emptyDatErr = 0
0056     emptyDatOnFarm = 0
0057     cmdNotFound = 0
0058     insuffPriv = 0
0059     quotaspace = 0
0060     copyerr=0
0061     ispede=0
0062     mismatchedInputFiles = 0
0063     missingBinaryfile = 0
0064     missingTreefile = 0
0065     missingMonitor = 0
0066     zeroEvents = 0
0067 
0068     kill_reason = None
0069     pedeLogErrStr = ""
0070     pedeLogWrnStr = ""
0071     remark = ""
0072 
0073     disabled = "";
0074     if 'DISABLED' in lib.JOBSTATUS[i]:
0075         disabled = 'DISABLED'
0076 
0077     if 'FETCH' in lib.JOBSTATUS[i]:
0078 
0079         # open the STDOUT file
0080         stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
0081         # unzip the STDOUT file if necessary
0082         if os.access(stdOut+'.gz', os.R_OK):
0083             os.system('gunzip '+stdOut+'.gz')
0084 
0085         try:
0086             with open(stdOut, "r") as STDFILE:
0087                 # scan records in input file.
0088                 # use regular expression to search. re.compile needed for options re.M and re.I
0089                 # re.M=re.MULTILINE enables matching of newline char
0090                 # re.I=re.IGNORECASE makes matching case-insensitive.
0091                 for line in STDFILE:
0092                     if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
0093                         quotaspace = 1
0094                     if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
0095                         quotaspace = 1
0096                     if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
0097                         quotaspace = 1
0098                     if re.search(re.compile('CERN report: Job Killed',re.M), line):
0099                         killed = 1
0100                     if re.search(re.compile('Job finished',re.M), line):
0101                         finished = 1
0102                     if re.search(re.compile('connection timed out',re.M), line):
0103                         timeout = 1
0104                     if re.search(re.compile('ConfigFileReadError',re.M), line):
0105                         cfgerr = 1
0106                     if re.search(re.compile('0 bytes transferred',re.M), line):
0107                         emptyDatOnFarm = 1
0108                     if re.search(re.compile('command not found',re.M), line):
0109                         cmdNotFound = 1
0110                     # AP 26.11.2009 Insufficient privileges to rfcp files
0111                     if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
0112                         insuffPriv = 1
0113                     if re.search(re.compile('Give up doing',re.M), line):
0114                         copyerr = 1
0115                     if re.search(re.compile('Directory content before',re.M),line):
0116                         ispede = 1
0117                     # AP 05.11.2015 Extract cpu-time.
0118                     # STDOUT doesn't contain NCU anymore. Now KSI2K and HS06 seconds are displayed.
0119                     # The ncuFactor is calculated from few samples by comparing KSI2K seconds with
0120                     # CPU time from email.
0121                     match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
0122                     if match:
0123                         cpuFactor = 2.125
0124                         cputime = int(round(int(match.group(1))/cpuFactor)) # match.group(1) is the matched digit
0125                     if re.search(re.compile('Missing milleBinary',re.M), line):
0126                          missingBinaryfile = 1
0127                     if re.search(re.compile('Missing treeFile',re.M), line):
0128                          missingTreefile = 1
0129                     if re.search(re.compile('Missing Monitor',re.M), line):
0130                          missingMonitor = 1
0131 
0132             # gzip it afterwards:
0133             print('Checked '+stdOut)
0134             os.system('gzip -f '+stdOut)
0135         except IOError as e:
0136             if e.args == (2, "No such file or directory"):
0137                 print("mps_check.py cannot find", stdOut, "to test")
0138             else:
0139                 raise
0140 
0141         # check HTCondor log file
0142         try:
0143             log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
0144             condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
0145                                                   "-userlog", log_file,
0146                                                   "-af",
0147                                                   "RemoteUserCpu",
0148                                                   "JobStatus",
0149                                                   "RemoveReason"],
0150                                                  stderr = subprocess.STDOUT).decode()
0151             condor_log = condor_log.split()
0152 
0153             cputime = int(round(float(condor_log[0])))
0154 
0155             if condor_log[1] == "3": # JobStatus == Removed
0156                 killed = 1
0157                 kill_reason = " ".join(condor_log[2:])
0158 
0159         except subprocess.CalledProcessError as e:
0160             pass
0161 
0162 
0163         # GF: This file is not produced (anymore...) -> check for existence and read-access added
0164         eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
0165         if os.access(eazeLog, os.R_OK):
0166             # open the input file
0167             with open(eazeLog, "r") as INFILE:
0168                 # scan records in input file
0169                 for line in INFILE:
0170                     # check if end of file has been reached
0171                     if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0172                         eofile = 1
0173                     if re.search(re.compile('Time limit reached\.',re.M), line):
0174                         timel = 1
0175                     if re.search(re.compile('gives I\/O problem',re.M), line):
0176                         ioprob = 1
0177                     if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0178                         fw8001 = 1
0179                     if re.search(re.compile('too many tracks',re.M), line):
0180                         tooManyTracks = 1
0181                     if re.search(re.compile('segmentation violation',re.M), line):
0182                         segviol = 1
0183                     if re.search(re.compile('failed RFIO error',re.M), line):
0184                         rfioerr = 1
0185                     if re.search(re.compile('Request exceeds quota',re.M), line):
0186                         quota = 1
0187 
0188         # if there is an alignment.log[.gz] file, check it as well
0189         eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
0190         logZipped = 'no'
0191         # unzip the logfile if necessary
0192         if os.access(eazeLog+'.gz', os.R_OK):
0193             os.system('gunzip '+eazeLog+'.gz')
0194             logZipped = 'true'
0195 
0196         if os.access(eazeLog, os.R_OK):   # access to alignment.log
0197             # open the input file
0198             with open(eazeLog,'r') as INFILE:
0199                 # scan records in input file
0200                 for line in INFILE:
0201                     # check whether any file could not be opened
0202                     if re.search(re.compile('Failed to open the file',re.M), line):
0203                         badInputFile = 1
0204                         break
0205                     # check if end of file has been reached
0206                     if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0207                         eofile = 1
0208                     if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
0209                         timel = 1
0210                     if re.search(re.compile('GAF gives I\/O problem',re.M), line):
0211                         ioprob = 1
0212                     if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0213                         fw8001 = 1
0214                     if re.search(re.compile('too many tracks',re.M), line):
0215                         tooManyTracks = 1
0216                     if re.search(re.compile('segmentation violation',re.M), line):
0217                         segviol = 1
0218                     if re.search(re.compile('failed RFIO error',re.M), line):
0219                         rfioerr = 1
0220                     if re.search(re.compile('Request exceeds quota',re.M), line):
0221                         quota = 1
0222                     # check for newer (e.g. CMSSW_5_1_X) and older CMSSW:
0223                     if re.search(re.compile('Fatal Exception',re.M), line):
0224                         exceptionCaught = 1
0225                     if re.search(re.compile('Exception caught in cmsRun',re.M), line):
0226                         exceptionCaught = 1
0227                     # AP 07.09.2009 - Check that the job got to a normal end
0228                     if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
0229                         endofjob = 1
0230                     if re.search(re.compile('MismatchedInputFiles',re.M), line):
0231                         mismatchedInputFiles = 1
0232                     if re.search(re.compile('Did not process any events',re.M), line):
0233                         zeroEvents = 1
0234                     if re.search(re.compile('FwkReport            -i main_input:sourc',re.M), line):
0235                         array = line.split()
0236                         nEvent = int(array[5])
0237                     if nEvent==0 and re.search(re.compile('FwkReport            -i PostSource',re.M), line):
0238                         array = line.split()
0239                         nEvent = int(array[5])
0240                     # AP 31.07.2009 - To read number of events in CMSSW_3_2_2_patch2
0241                     if nEvent==0 and re.search(re.compile('FwkReport            -i AfterSource',re.M), line):
0242                         array = line.split()
0243                         nEvent = int(array[5])
0244                     # RM 03.02.2023
0245                     if nEvent==0:
0246                         x = re.search(r"FwkReport            -f AfterSource\s+(\d+)",line)
0247                         if x:
0248                             nEvent = int(x.group(1))
0249 
0250             if logZipped == 'true':
0251                 os.system('gzip -f '+eazeLog)
0252 
0253         else:   # no access to alignment.log
0254             print('mps_check.py cannot find',eazeLog,'to test')
0255             # AP 07.09.2009 - The following check cannot be done: set to 1 to avoid fake error type
0256             endofjob = 1
0257 
0258         # for mille jobs checks that milleBinary file is not empty
0259         if i<lib.nJobs:  # mille job!
0260             milleOut = 'milleBinary%03d.dat' % (i+1)
0261             # from Perl, should be deprecated because of cmsls and nsls
0262             #(not translated to python, yes I'm lazy... use subprocess.checkout if needed):
0263             #$mOutSize = `nsls -l $mssDir | grep $milleOut | head -1 | awk '{print \$5}'`;
0264             #$mOutSize = `cmsLs -l $mssDir | grep $milleOut | head -1 | awk '{print \$2}'`;
0265             mOutSize = 0
0266             for line in eoslsoutput:
0267                 if milleOut in line:
0268                     columns = line.split()
0269                     mOutSize = int(columns[4]) # 5th column = size
0270             if not (mOutSize>0):
0271                 emptyDatErr = 1
0272 
0273         # merge jobs: additional checks for merging job
0274         else:
0275             # if there is a pede.dump file check it as well
0276             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
0277             if os.access(eazeLog+'.gz', os.R_OK):
0278                 # unzip - but clean before and save to tmp
0279                 os.system('rm -f /tmp/pede.dump')
0280                 os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
0281                 eazeLog = '/tmp/pede.dump'
0282             if os.access(eazeLog, os.R_OK):
0283                 with open(eazeLog, "r") as INFILE: # open pede.dump
0284                     # scan records in INFILE
0285                     pedeAbend = 1
0286                     usedPedeMem = 0.
0287                     for line in INFILE:
0288                         # check if pede has reached its normal end
0289                         if re.search(re.compile('Millepede II.* ending',re.M), line):
0290                             pedeAbend = 0
0291                         # extract memory usage
0292                         match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
0293                         if match:
0294                             mem = match.group(1)
0295                             mem = re.sub('\s', '', mem)
0296                             # if mem is a float
0297                             if re.search(re.compile('^\d+\.\d+$',re.M), mem):
0298                                 usedPedeMem = float(mem)
0299                             else:
0300                                 print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
0301 
0302                 # check memory usage
0303                 # no point in asking if lib.pedeMem is defined. Initialized as lib.pedeMem=-1
0304                 if lib.pedeMem > 0 and usedPedeMem > 0.:
0305                     memoryratio = usedPedeMem /(lib.pedeMem/1024.)
0306                     # print a warning if more than approx. 4 GB have been
0307                     # requested of which less than 75% are used by Pede
0308                     if lib.pedeMem > 4000 and memoryratio < 0.75 :
0309                         msg = ("Warning: {0:.2f} GB of memory for Pede "
0310                                "requested, but only {1:.1f}% of it has been "
0311                                "used! Consider to request less memory in order "
0312                                "to save resources.")
0313                         print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0314                     elif memoryratio > 1 :
0315                         msg = ("Warning: {0:.2f} GB of memory for Pede "
0316                                "requested, but {1:.1f}% of this has been "
0317                                "used! Consider to request more memory to avoid "
0318                                "premature removal of the job by the admin.")
0319                         print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0320                     else:
0321                         msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
0322                                "which has been requested for Pede.")
0323                         print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
0324 
0325 
0326                 # clean up /tmp/pede.dump if needed
0327                 if eazeLog == '/tmp/pede.dump':
0328                     os.system('rm /tmp/pede.dump')
0329 
0330             # pede.dump not found or no read-access
0331             else:
0332                 print('mps_check.py cannot find',eazeLog,'to test')
0333 
0334             # if there is a millepede.log file, check it as well
0335             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
0336             logZipped = 'no'
0337             if os.access(eazeLog+'.gz', os.R_OK):
0338                 os.system('gunzip '+eazeLog+'.gz')
0339                 logZipped = 'true'
0340 
0341             if os.access(eazeLog, os.R_OK):
0342                 # open log file
0343                 with open(eazeLog, "r") as INFILE:
0344                     # scan records in input file
0345                     for line in INFILE:
0346                         # Checks for Pede Errors
0347                         if re.search(re.compile('step no descending',re.M), line):
0348                             pedeLogErr = 1
0349                             pedeLogErrStr += line
0350                         if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
0351                             pedeLogErr = 1
0352                             pedeLogErrStr += line
0353                         # AP 07.09.2009 - Checks for Pede Warnings:
0354                         if re.search(re.compile('insufficient constraint equations',re.M), line):
0355                             pedeLogWrn = 1
0356                             pedeLogWrnStr += line
0357 
0358                 if logZipped == 'true':
0359                     os.system('gzip -f '+eazeLog)
0360             else:
0361                 print('mps_check.py cannot find',eazeLog,'to test')
0362 
0363 
0364             # check millepede.end -- added F. Meier 03.03.2015
0365             eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
0366             logZipped = 'no'
0367             if os.access(eazeLog+'.gz', os.R_OK):
0368                 os.system('gunzip'+eazeLog+'.gz')
0369                 logZipped = 'true'
0370 
0371             if os.access(eazeLog, os.R_OK):
0372                 # open log file
0373                 with open(eazeLog, "r") as INFILE:
0374                     # scan records in input file
0375                     for line in INFILE:
0376                         # Checks for the output code. 0 is OK, 1 is WARN, anything else is FAIL
0377                         # searches the line for a number with or without a sign
0378                         match = re.search(re.compile('([-+]?\d+)',re.M), line)
0379                         if match:
0380                             if int(match.group(1)) == 1:
0381                                 pedeLogWrn = 1
0382                                 pedeLogWrnStr += line
0383                             elif int(match.group(1)) != 0:
0384                                 pedeLogErr = 1
0385                                 pedeLogErrStr += line
0386                 if logZipped == 'true':
0387                     os.system('gzip -f '+eazeLog)
0388             else:
0389                 print('mps_check.py cannot find',eazeLog,'to test')
0390 
0391         # end of merge job checks
0392         # evaluate Errors:
0393         farmhost = ' '
0394 
0395         okStatus = 'OK'
0396         if badInputFile == 1:
0397             print(lib.JOBDIR[i],lib.JOBID[i],'had FileOpenError')
0398             okStatus = 'FAIL'
0399             remark = 'FileOpenError'
0400         if not eofile == 1:
0401             print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
0402             okStatus = 'ABEND'
0403         if quotaspace == 1:
0404             print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
0405             okStatus = 'FAIL'
0406             remark = 'eos quota space problem'
0407         if ioprob == 1:
0408             print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
0409             okStatus = 'FAIL'
0410         if fw8001 == 1:
0411             print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
0412             remark = 'fwk error 8001'
0413             okStatus = 'FAIL'
0414         if timeout == 1:
0415             print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
0416             remark = 'connection timed out'
0417         if cfgerr == 1:
0418             print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
0419             remark = 'cfg file error'
0420             okStatus = 'FAIL'
0421         if killed == 1:
0422             guess = " (probably time exceeded)" if kill_reason is None else ":"
0423             print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
0424             if kill_reason is not None: print(kill_reason)
0425             remark = "killed";
0426             okStatus = "FAIL"
0427         if timel == 1:
0428             print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
0429             okStatus = 'TIMEL'
0430         if tooManyTracks == 1:
0431             print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
0432         if segviol == 1:
0433             print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
0434             remark = 'seg viol'
0435             okStatus = 'FAIL'
0436         if rfioerr == 1:
0437             print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
0438             remark = 'rfio error'
0439             okStatus = 'FAIL'
0440         if quota == 1:
0441             print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
0442         if exceptionCaught == 1:
0443             print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
0444             remark = 'Exception caught'
0445             okStatus = 'FAIL'
0446         if emptyDatErr == 1:
0447             print('milleBinary???.dat file not found or empty')
0448             remark = 'empty milleBinary'
0449             if emptyDatOnFarm > 0:
0450                 print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
0451             else:
0452                 okStatus = 'FAIL'
0453         if cmdNotFound == 1:
0454             print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
0455             remark = 'cmd not found'
0456             okStatus = 'FAIL'
0457         if insuffPriv == 1:
0458             print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
0459             remark = 'Could not rfcp files'
0460             okStatus = 'FAIL'
0461         if pedeAbend == 1:
0462             print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
0463             remark = 'pede failed'
0464             okStatus = 'FAIL'
0465         if pedeLogErr == 1:
0466             print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
0467             print(pedeLogErrStr)
0468             remark = 'pede error'
0469             okStatus = 'FAIL'
0470         if pedeLogWrn == 1:
0471             # AP 07.09.2009 - Reports Pede Warnings (but do _not_ set job status to FAIL)
0472             print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
0473             print(pedeLogWrnStr)
0474             remark = 'pede warnings'
0475             okStatus = 'WARN'
0476         if endofjob != 1:
0477             print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
0478             remark = 'job not ended'
0479             okStatus = 'FAIL'
0480         if copyerr == 1 and ispede!=1:
0481             #Copy errors in pede job can occur when a nonexistent file is commented in alignment_merge.py but not in theScript.sh, and in that case is *not* a failure
0482             print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
0483             remark = 'copy to eos failed'
0484             okStatus = 'FAIL'
0485 
0486         if missingBinaryfile == 1:
0487             print(lib.JOBDIR[i],lib.JOBID[i],'Missing binary file')
0488             remark = 'Missing binary file'
0489             okStatus = 'FAIL'
0490         if missingTreefile == 1:
0491             print(lib.JOBDIR[i],lib.JOBID[i],'Missing Treefile')
0492             remark = 'Missing Treefile'
0493             okStatus = 'FAIL'
0494         if missingMonitor == 1:
0495             print(lib.JOBDIR[i],lib.JOBID[i],'Missing Monitor')
0496             remark = 'Missing Monitor file'
0497             okStatus = 'FAIL'
0498         if zeroEvents == 1:
0499             print(lib.JOBDIR[i],lib.JOBID[i],'Did not process any events')
0500             remark = 'Did not process any events'
0501             okStatus = 'FAIL'
0502         if mismatchedInputFiles == 1:
0503             print(lib.JOBDIR[i],lib.JOBID[i],'MismatchedInputFiles')
0504             remark = 'MismatchedInputFiles'
0505             okStatus = 'FAIL'
0506         # print warning line to stdout
0507         if okStatus != "OK":
0508             print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
0509 
0510         # update number of events
0511         lib.JOBNEVT[i] = nEvent
0512         # udate Jobstatus
0513         lib.JOBSTATUS[i] = disabled+okStatus
0514         # update cputime
0515         lib.JOBRUNTIME[i] = cputime
0516         # update remark
0517         lib.JOBREMARK[i] = remark
0518         # update host
0519         ##lib.JOBHOST[i] = farmhost
0520 
0521 lib.write_db()
0522