File indexing completed on 2023-03-17 10:39:31
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 from __future__ import print_function
0015 from builtins import range
0016 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0017 import subprocess
0018 import re
0019 import os
0020
0021 lib = mpslib.jobdatabase()
0022 lib.read_db()
0023
0024
0025 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
0026 try:
0027 eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).decode().split('\n')
0028 except subprocess.CalledProcessError:
0029 eoslsoutput = ""
0030
0031
0032 for i in range(len(lib.JOBID)):
0033
0034 batchSuccess = 0
0035 batchExited = 0
0036 finished = 0
0037 endofjob = 0
0038 eofile = 1
0039 timel = 0
0040 killed = 0
0041 ioprob = 0
0042 fw8001 = 0
0043 tooManyTracks = 0
0044 segviol = 0
0045 rfioerr = 0
0046 quota = 0
0047 nEvent = 0
0048 cputime = -1
0049 pedeAbend = 0
0050 pedeLogErr = 0
0051 pedeLogWrn = 0
0052 exceptionCaught = 0
0053 timeout = 0
0054 cfgerr = 0
0055 emptyDatErr = 0
0056 emptyDatOnFarm = 0
0057 cmdNotFound = 0
0058 insuffPriv = 0
0059 quotaspace = 0
0060 copyerr=0
0061 ispede=0
0062
0063 kill_reason = None
0064 pedeLogErrStr = ""
0065 pedeLogWrnStr = ""
0066 remark = ""
0067
0068 disabled = "";
0069 if 'DISABLED' in lib.JOBSTATUS[i]:
0070 disabled = 'DISABLED'
0071
0072 if 'FETCH' in lib.JOBSTATUS[i]:
0073
0074
0075 stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
0076
0077 if os.access(stdOut+'.gz', os.R_OK):
0078 os.system('gunzip '+stdOut+'.gz')
0079
0080 try:
0081 with open(stdOut, "r") as STDFILE:
0082
0083
0084
0085
0086 for line in STDFILE:
0087 if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
0088 quotaspace = 1
0089 if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
0090 quotaspace = 1
0091 if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
0092 quotaspace = 1
0093 if re.search(re.compile('CERN report: Job Killed',re.M), line):
0094 killed = 1
0095 if re.search(re.compile('Job finished',re.M), line):
0096 finished = 1
0097 if re.search(re.compile('connection timed out',re.M), line):
0098 timeout = 1
0099 if re.search(re.compile('ConfigFileReadError',re.M), line):
0100 cfgerr = 1
0101 if re.search(re.compile('0 bytes transferred',re.M), line):
0102 emptyDatOnFarm = 1
0103 if re.search(re.compile('command not found',re.M), line):
0104 cmdNotFound = 1
0105
0106 if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
0107 insuffPriv = 1
0108 if re.search(re.compile('Give up doing',re.M), line):
0109 copyerr = 1
0110 if re.search(re.compile('Directory content before',re.M),line):
0111 ispede = 1
0112
0113
0114
0115
0116 match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
0117 if match:
0118 cpuFactor = 2.125
0119 cputime = int(round(int(match.group(1))/cpuFactor))
0120
0121
0122 print('gzip -f '+stdOut)
0123 os.system('gzip -f '+stdOut)
0124 except IOError as e:
0125 if e.args == (2, "No such file or directory"):
0126 print("mps_check.py cannot find", stdOut, "to test")
0127 else:
0128 raise
0129
0130
0131 try:
0132 log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
0133 condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
0134 "-userlog", log_file,
0135 "-af",
0136 "RemoteSysCpu",
0137 "JobStatus",
0138 "RemoveReason"],
0139 stderr = subprocess.STDOUT).decode()
0140 condor_log = condor_log.split()
0141
0142 cputime = int(round(float(condor_log[0])))
0143
0144 if condor_log[1] == "3":
0145 killed = 1
0146 kill_reason = " ".join(condor_log[2:])
0147
0148 except subprocess.CalledProcessError as e:
0149 pass
0150
0151
0152
0153 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
0154 if os.access(eazeLog, os.R_OK):
0155
0156 with open(eazeLog, "r") as INFILE:
0157
0158 for line in INFILE:
0159
0160 if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0161 eofile = 1
0162 if re.search(re.compile('Time limit reached\.',re.M), line):
0163 timel = 1
0164 if re.search(re.compile('gives I\/O problem',re.M), line):
0165 ioprob = 1
0166 if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0167 fw8001 = 1
0168 if re.search(re.compile('too many tracks',re.M), line):
0169 tooManyTracks = 1
0170 if re.search(re.compile('segmentation violation',re.M), line):
0171 segviol = 1
0172 if re.search(re.compile('failed RFIO error',re.M), line):
0173 rfioerr = 1
0174 if re.search(re.compile('Request exceeds quota',re.M), line):
0175 quota = 1
0176
0177
0178 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
0179 logZipped = 'no'
0180
0181 if os.access(eazeLog+'.gz', os.R_OK):
0182 os.system('gunzip '+eazeLog+'.gz')
0183 logZipped = 'true'
0184
0185 if os.access(eazeLog, os.R_OK):
0186
0187 with open(eazeLog,'r') as INFILE:
0188
0189 for line in INFILE:
0190
0191 if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0192 eofile = 1
0193 if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
0194 timel = 1
0195 if re.search(re.compile('GAF gives I\/O problem',re.M), line):
0196 ioprob = 1
0197 if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0198 fw8001 = 1
0199 if re.search(re.compile('too many tracks',re.M), line):
0200 tooManyTracks = 1
0201 if re.search(re.compile('segmentation violation',re.M), line):
0202 segviol = 1
0203 if re.search(re.compile('failed RFIO error',re.M), line):
0204 rfioerr = 1
0205 if re.search(re.compile('Request exceeds quota',re.M), line):
0206 quota = 1
0207
0208 if re.search(re.compile('Fatal Exception',re.M), line):
0209 exceptionCaught = 1
0210 if re.search(re.compile('Exception caught in cmsRun',re.M), line):
0211 exceptionCaught = 1
0212
0213 if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
0214 endofjob = 1
0215 if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
0216 array = line.split()
0217 nEvent = int(array[5])
0218 if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
0219 array = line.split()
0220 nEvent = int(array[5])
0221
0222 if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
0223 array = line.split()
0224 nEvent = int(array[5])
0225
0226 if logZipped == 'true':
0227 os.system('gzip -f '+eazeLog)
0228
0229 else:
0230 print('mps_check.py cannot find',eazeLog,'to test')
0231
0232 endofjob = 1
0233
0234
0235 if i<lib.nJobs:
0236 milleOut = 'milleBinary%03d.dat' % (i+1)
0237
0238
0239
0240
0241 mOutSize = 0
0242
0243 for line in eoslsoutput:
0244 if milleOut in line:
0245 columns = line.split()
0246 mOutSize = int(columns[4])
0247 if not (mOutSize>0):
0248 emptyDatErr = 1
0249
0250
0251 else:
0252
0253 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
0254 if os.access(eazeLog+'.gz', os.R_OK):
0255
0256 os.system('rm -f /tmp/pede.dump')
0257 os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
0258 eazeLog = '/tmp/pede.dump'
0259 if os.access(eazeLog, os.R_OK):
0260 with open(eazeLog, "r") as INFILE:
0261
0262 pedeAbend = 1
0263 usedPedeMem = 0.
0264 for line in INFILE:
0265
0266 if re.search(re.compile('Millepede II.* ending',re.M), line):
0267 pedeAbend = 0
0268
0269 match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
0270 if match:
0271 mem = match.group(1)
0272 mem = re.sub('\s', '', mem)
0273
0274 if re.search(re.compile('^\d+\.\d+$',re.M), mem):
0275 usedPedeMem = float(mem)
0276 else:
0277 print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
0278
0279
0280
0281 if lib.pedeMem > 0 and usedPedeMem > 0.:
0282 memoryratio = usedPedeMem /(lib.pedeMem/1024.)
0283
0284
0285 if lib.pedeMem > 4000 and memoryratio < 0.75 :
0286 msg = ("Warning: {0:.2f} GB of memory for Pede "
0287 "requested, but only {1:.1f}% of it has been "
0288 "used! Consider to request less memory in order "
0289 "to save resources.")
0290 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0291 elif memoryratio > 1 :
0292 msg = ("Warning: {0:.2f} GB of memory for Pede "
0293 "requested, but {1:.1f}% of this has been "
0294 "used! Consider to request more memory to avoid "
0295 "premature removal of the job by the admin.")
0296 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0297 else:
0298 msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
0299 "which has been requested for Pede.")
0300 print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
0301
0302
0303
0304 if eazeLog == '/tmp/pede.dump':
0305 os.system('rm /tmp/pede.dump')
0306
0307
0308 else:
0309 print('mps_check.py cannot find',eazeLog,'to test')
0310
0311
0312 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
0313 logZipped = 'no'
0314 if os.access(eazeLog+'.gz', os.R_OK):
0315 os.system('gunzip '+eazeLog+'.gz')
0316 logZipped = 'true'
0317
0318 if os.access(eazeLog, os.R_OK):
0319
0320 with open(eazeLog, "r") as INFILE:
0321
0322 for line in INFILE:
0323
0324 if re.search(re.compile('step no descending',re.M), line):
0325 pedeLogErr = 1
0326 pedeLogErrStr += line
0327 if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
0328 pedeLogErr = 1
0329 pedeLogErrStr += line
0330
0331 if re.search(re.compile('insufficient constraint equations',re.M), line):
0332 pedeLogWrn = 1
0333 pedeLogWrnStr += line
0334
0335 if logZipped == 'true':
0336 os.system('gzip -f '+eazeLog)
0337 else:
0338 print('mps_check.py cannot find',eazeLog,'to test')
0339
0340
0341
0342 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
0343 logZipped = 'no'
0344 if os.access(eazeLog+'.gz', os.R_OK):
0345 os.system('gunzip'+eazeLog+'.gz')
0346 logZipped = 'true'
0347
0348 if os.access(eazeLog, os.R_OK):
0349
0350 with open(eazeLog, "r") as INFILE:
0351
0352 for line in INFILE:
0353
0354
0355 match = re.search(re.compile('([-+]?\d+)',re.M), line)
0356 if match:
0357 if int(match.group(1)) == 1:
0358 pedeLogWrn = 1
0359 pedeLogWrnStr += line
0360 elif int(match.group(1)) != 0:
0361 pedeLogErr = 1
0362 pedeLogErrStr += line
0363 if logZipped == 'true':
0364 os.system('gzip -f '+eazeLog)
0365 else:
0366 print('mps_check.py cannot find',eazeLog,'to test')
0367
0368
0369
0370 farmhost = ' '
0371
0372 okStatus = 'OK'
0373 if not eofile == 1:
0374 print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
0375 okStatus = 'ABEND'
0376 if quotaspace == 1:
0377 print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
0378 okStatus = 'FAIL'
0379 remark = 'eos quota space problem'
0380 if ioprob == 1:
0381 print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
0382 okStatus = 'FAIL'
0383 if fw8001 == 1:
0384 print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
0385 remark = 'fwk error 8001'
0386 okStatus = 'FAIL'
0387 if timeout == 1:
0388 print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
0389 remark = 'connection timed out'
0390 if cfgerr == 1:
0391 print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
0392 remark = 'cfg file error'
0393 okStatus = 'FAIL'
0394 if killed == 1:
0395 guess = " (probably time exceeded)" if kill_reason is None else ":"
0396 print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
0397 if kill_reason is not None: print(kill_reason)
0398 remark = "killed";
0399 okStatus = "FAIL"
0400 if timel == 1:
0401 print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
0402 okStatus = 'TIMEL'
0403 if tooManyTracks == 1:
0404 print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
0405 if segviol == 1:
0406 print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
0407 remark = 'seg viol'
0408 okStatus = 'FAIL'
0409 if rfioerr == 1:
0410 print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
0411 remark = 'rfio error'
0412 okStatus = 'FAIL'
0413 if quota == 1:
0414 print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
0415 if exceptionCaught == 1:
0416 print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
0417 remark = 'Exception caught'
0418 okStatus = 'FAIL'
0419 if emptyDatErr == 1:
0420 print('milleBinary???.dat file not found or empty')
0421 remark = 'empty milleBinary'
0422 if emptyDatOnFarm > 0:
0423 print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
0424 else:
0425 okStatus = 'FAIL'
0426 if cmdNotFound == 1:
0427 print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
0428 remark = 'cmd not found'
0429 okStatus = 'FAIL'
0430 if insuffPriv == 1:
0431 print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
0432 remark = 'Could not rfcp files'
0433 okStatus = 'FAIL'
0434 if pedeAbend == 1:
0435 print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
0436 remark = 'pede failed'
0437 okStatus = 'FAIL'
0438 if pedeLogErr == 1:
0439 print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
0440 print(pedeLogErrStr)
0441 remark = 'pede error'
0442 okStatus = 'FAIL'
0443 if pedeLogWrn == 1:
0444
0445 print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
0446 print(pedeLogWrnStr)
0447 remark = 'pede warnings'
0448 okStatus = 'WARN'
0449 if endofjob != 1:
0450 print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
0451 remark = 'job not ended'
0452 okStatus = 'FAIL'
0453 if copyerr == 1 and ispede!=1:
0454
0455 print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
0456 remark = 'copy to eos failed'
0457 okStatus = 'FAIL'
0458
0459
0460
0461 if okStatus != "OK":
0462 print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
0463
0464
0465 lib.JOBNEVT[i] = nEvent
0466
0467 lib.JOBSTATUS[i] = disabled+okStatus
0468
0469 lib.JOBRUNTIME[i] = cputime
0470
0471 lib.JOBREMARK[i] = remark
0472
0473
0474
0475 lib.write_db()
0476