File indexing completed on 2025-01-14 23:16:26
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 from builtins import range
0015 import Alignment.MillePedeAlignmentAlgorithm.mpslib.Mpslibclass as mpslib
0016 import subprocess
0017 import re
0018 import os
0019
0020 lib = mpslib.jobdatabase()
0021 lib.read_db()
0022
0023
0024 command = ["ls", "-l", os.path.join(lib.mssDir, "binaries")]
0025 try:
0026 eoslsoutput = subprocess.check_output(command, stderr=subprocess.STDOUT).decode().split('\n')
0027 except subprocess.CalledProcessError:
0028 eoslsoutput = ""
0029
0030
0031 for i in range(len(lib.JOBID)):
0032
0033 batchSuccess = 0
0034 batchExited = 0
0035 finished = 0
0036 endofjob = 0
0037 badInputFile = 0
0038 eofile = 1
0039 timel = 0
0040 killed = 0
0041 ioprob = 0
0042 fw8001 = 0
0043 tooManyTracks = 0
0044 segviol = 0
0045 rfioerr = 0
0046 quota = 0
0047 nEvent = 0
0048 cputime = -1
0049 pedeAbend = 0
0050 pedeLogErr = 0
0051 pedeLogWrn = 0
0052 exceptionCaught = 0
0053 timeout = 0
0054 cfgerr = 0
0055 emptyDatErr = 0
0056 emptyDatOnFarm = 0
0057 cmdNotFound = 0
0058 insuffPriv = 0
0059 quotaspace = 0
0060 copyerr=0
0061 ispede=0
0062 mismatchedInputFiles = 0
0063 missingBinaryfile = 0
0064 missingTreefile = 0
0065 missingMonitor = 0
0066 zeroEvents = 0
0067
0068 kill_reason = None
0069 pedeLogErrStr = ""
0070 pedeLogWrnStr = ""
0071 remark = ""
0072
0073 disabled = "";
0074 if 'DISABLED' in lib.JOBSTATUS[i]:
0075 disabled = 'DISABLED'
0076
0077 if 'FETCH' in lib.JOBSTATUS[i]:
0078
0079
0080 stdOut = 'jobData/'+lib.JOBDIR[i]+'/STDOUT'
0081
0082 if os.access(stdOut+'.gz', os.R_OK):
0083 os.system('gunzip '+stdOut+'.gz')
0084
0085 try:
0086 with open(stdOut, "r") as STDFILE:
0087
0088
0089
0090
0091 for line in STDFILE:
0092 if re.search(re.compile('Unable to access quota space',re.M|re.I), line):
0093 quotaspace = 1
0094 if re.search(re.compile('Unable to get quota space',re.M|re.I), line):
0095 quotaspace = 1
0096 if re.search(re.compile('Disk quota exceeded',re.M|re.I), line):
0097 quotaspace = 1
0098 if re.search(re.compile('CERN report: Job Killed',re.M), line):
0099 killed = 1
0100 if re.search(re.compile('Job finished',re.M), line):
0101 finished = 1
0102 if re.search(re.compile('connection timed out',re.M), line):
0103 timeout = 1
0104 if re.search(re.compile('ConfigFileReadError',re.M), line):
0105 cfgerr = 1
0106 if re.search(re.compile('0 bytes transferred',re.M), line):
0107 emptyDatOnFarm = 1
0108 if re.search(re.compile('command not found',re.M), line):
0109 cmdNotFound = 1
0110
0111 if re.search(re.compile('stage_put: Insufficient user privileges',re.M), line):
0112 insuffPriv = 1
0113 if re.search(re.compile('Give up doing',re.M), line):
0114 copyerr = 1
0115 if re.search(re.compile('Directory content before',re.M),line):
0116 ispede = 1
0117
0118
0119
0120
0121 match = re.search(re.compile('This process used .+?(\d+) KSI2K seconds',re.M|re.I), line)
0122 if match:
0123 cpuFactor = 2.125
0124 cputime = int(round(int(match.group(1))/cpuFactor))
0125 if re.search(re.compile('Missing milleBinary',re.M), line):
0126 missingBinaryfile = 1
0127 if re.search(re.compile('Missing treeFile',re.M), line):
0128 missingTreefile = 1
0129 if re.search(re.compile('Missing Monitor',re.M), line):
0130 missingMonitor = 1
0131
0132
0133 print('Checked '+stdOut)
0134 os.system('gzip -f '+stdOut)
0135 except IOError as e:
0136 if e.args == (2, "No such file or directory"):
0137 print("mps_check.py cannot find", stdOut, "to test")
0138 else:
0139 raise
0140
0141
0142 try:
0143 log_file = os.path.join("jobData", lib.JOBDIR[i], "HTCJOB")
0144 condor_log = subprocess.check_output(["condor_q", lib.JOBID[i],
0145 "-userlog", log_file,
0146 "-af",
0147 "RemoteUserCpu",
0148 "JobStatus",
0149 "RemoveReason"],
0150 stderr = subprocess.STDOUT).decode()
0151 condor_log = condor_log.split()
0152
0153 cputime = int(round(float(condor_log[0])))
0154
0155 if condor_log[1] == "3":
0156 killed = 1
0157 kill_reason = " ".join(condor_log[2:])
0158
0159 except subprocess.CalledProcessError as e:
0160 pass
0161
0162
0163
0164 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/cmsRun.out'
0165 if os.access(eazeLog, os.R_OK):
0166
0167 with open(eazeLog, "r") as INFILE:
0168
0169 for line in INFILE:
0170
0171 if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0172 eofile = 1
0173 if re.search(re.compile('Time limit reached\.',re.M), line):
0174 timel = 1
0175 if re.search(re.compile('gives I\/O problem',re.M), line):
0176 ioprob = 1
0177 if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0178 fw8001 = 1
0179 if re.search(re.compile('too many tracks',re.M), line):
0180 tooManyTracks = 1
0181 if re.search(re.compile('segmentation violation',re.M), line):
0182 segviol = 1
0183 if re.search(re.compile('failed RFIO error',re.M), line):
0184 rfioerr = 1
0185 if re.search(re.compile('Request exceeds quota',re.M), line):
0186 quota = 1
0187
0188
0189 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/alignment.log'
0190 logZipped = 'no'
0191
0192 if os.access(eazeLog+'.gz', os.R_OK):
0193 os.system('gunzip '+eazeLog+'.gz')
0194 logZipped = 'true'
0195
0196 if os.access(eazeLog, os.R_OK):
0197
0198 with open(eazeLog,'r') as INFILE:
0199
0200 for line in INFILE:
0201
0202 if re.search(re.compile('Failed to open the file',re.M), line):
0203 badInputFile = 1
0204 break
0205
0206 if re.search(re.compile('\<StorageStatistics\>',re.M), line):
0207 eofile = 1
0208 if re.search(re.compile('EAZE\. Time limit reached\.',re.M), line):
0209 timel = 1
0210 if re.search(re.compile('GAF gives I\/O problem',re.M), line):
0211 ioprob = 1
0212 if re.search(re.compile('FrameworkError ExitStatus=[\'\"]8001[\'\"]',re.M), line):
0213 fw8001 = 1
0214 if re.search(re.compile('too many tracks',re.M), line):
0215 tooManyTracks = 1
0216 if re.search(re.compile('segmentation violation',re.M), line):
0217 segviol = 1
0218 if re.search(re.compile('failed RFIO error',re.M), line):
0219 rfioerr = 1
0220 if re.search(re.compile('Request exceeds quota',re.M), line):
0221 quota = 1
0222
0223 if re.search(re.compile('Fatal Exception',re.M), line):
0224 exceptionCaught = 1
0225 if re.search(re.compile('Exception caught in cmsRun',re.M), line):
0226 exceptionCaught = 1
0227
0228 if re.search(re.compile('AlignmentProducerAsAnalyzer::endJob\(\)',re.M), line):
0229 endofjob = 1
0230 if re.search(re.compile('MismatchedInputFiles',re.M), line):
0231 mismatchedInputFiles = 1
0232 if re.search(re.compile('Did not process any events',re.M), line):
0233 zeroEvents = 1
0234 if re.search(re.compile('FwkReport -i main_input:sourc',re.M), line):
0235 array = line.split()
0236 nEvent = int(array[5])
0237 if nEvent==0 and re.search(re.compile('FwkReport -i PostSource',re.M), line):
0238 array = line.split()
0239 nEvent = int(array[5])
0240
0241 if nEvent==0 and re.search(re.compile('FwkReport -i AfterSource',re.M), line):
0242 array = line.split()
0243 nEvent = int(array[5])
0244
0245 if nEvent==0:
0246 x = re.search(r"FwkReport -f AfterSource\s+(\d+)",line)
0247 if x:
0248 nEvent = int(x.group(1))
0249
0250 if logZipped == 'true':
0251 os.system('gzip -f '+eazeLog)
0252
0253 else:
0254 print('mps_check.py cannot find',eazeLog,'to test')
0255
0256 endofjob = 1
0257
0258
0259 if i<lib.nJobs:
0260 milleOut = 'milleBinary%03d.dat' % (i+1)
0261
0262
0263
0264
0265 mOutSize = 0
0266 for line in eoslsoutput:
0267 if milleOut in line:
0268 columns = line.split()
0269 mOutSize = int(columns[4])
0270 if not (mOutSize>0):
0271 emptyDatErr = 1
0272
0273
0274 else:
0275
0276 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/pede.dump'
0277 if os.access(eazeLog+'.gz', os.R_OK):
0278
0279 os.system('rm -f /tmp/pede.dump')
0280 os.system('gunzip -c '+eazeLog+'.gz > /tmp/pede.dump')
0281 eazeLog = '/tmp/pede.dump'
0282 if os.access(eazeLog, os.R_OK):
0283 with open(eazeLog, "r") as INFILE:
0284
0285 pedeAbend = 1
0286 usedPedeMem = 0.
0287 for line in INFILE:
0288
0289 if re.search(re.compile('Millepede II.* ending',re.M), line):
0290 pedeAbend = 0
0291
0292 match = re.search(re.compile('Peak dynamic memory allocation: (.+) GB',re.I), line)
0293 if match:
0294 mem = match.group(1)
0295 mem = re.sub('\s', '', mem)
0296
0297 if re.search(re.compile('^\d+\.\d+$',re.M), mem):
0298 usedPedeMem = float(mem)
0299 else:
0300 print('mps_check.py: Found Pede peak memory allocation but extracted number is not a float:',mem)
0301
0302
0303
0304 if lib.pedeMem > 0 and usedPedeMem > 0.:
0305 memoryratio = usedPedeMem /(lib.pedeMem/1024.)
0306
0307
0308 if lib.pedeMem > 4000 and memoryratio < 0.75 :
0309 msg = ("Warning: {0:.2f} GB of memory for Pede "
0310 "requested, but only {1:.1f}% of it has been "
0311 "used! Consider to request less memory in order "
0312 "to save resources.")
0313 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0314 elif memoryratio > 1 :
0315 msg = ("Warning: {0:.2f} GB of memory for Pede "
0316 "requested, but {1:.1f}% of this has been "
0317 "used! Consider to request more memory to avoid "
0318 "premature removal of the job by the admin.")
0319 print(msg.format(lib.pedeMem/1024.0, memoryratio*100))
0320 else:
0321 msg = ("Info: Used {0:.1f}% of {1:.2f} GB of memory "
0322 "which has been requested for Pede.")
0323 print(msg.format(memoryratio*100, lib.pedeMem/1024.0))
0324
0325
0326
0327 if eazeLog == '/tmp/pede.dump':
0328 os.system('rm /tmp/pede.dump')
0329
0330
0331 else:
0332 print('mps_check.py cannot find',eazeLog,'to test')
0333
0334
0335 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.log'
0336 logZipped = 'no'
0337 if os.access(eazeLog+'.gz', os.R_OK):
0338 os.system('gunzip '+eazeLog+'.gz')
0339 logZipped = 'true'
0340
0341 if os.access(eazeLog, os.R_OK):
0342
0343 with open(eazeLog, "r") as INFILE:
0344
0345 for line in INFILE:
0346
0347 if re.search(re.compile('step no descending',re.M), line):
0348 pedeLogErr = 1
0349 pedeLogErrStr += line
0350 if re.search(re.compile('Constraint equation discrepancies:',re.M), line):
0351 pedeLogErr = 1
0352 pedeLogErrStr += line
0353
0354 if re.search(re.compile('insufficient constraint equations',re.M), line):
0355 pedeLogWrn = 1
0356 pedeLogWrnStr += line
0357
0358 if logZipped == 'true':
0359 os.system('gzip -f '+eazeLog)
0360 else:
0361 print('mps_check.py cannot find',eazeLog,'to test')
0362
0363
0364
0365 eazeLog = 'jobData/'+lib.JOBDIR[i]+'/millepede.end'
0366 logZipped = 'no'
0367 if os.access(eazeLog+'.gz', os.R_OK):
0368 os.system('gunzip'+eazeLog+'.gz')
0369 logZipped = 'true'
0370
0371 if os.access(eazeLog, os.R_OK):
0372
0373 with open(eazeLog, "r") as INFILE:
0374
0375 for line in INFILE:
0376
0377
0378 match = re.search(re.compile('([-+]?\d+)',re.M), line)
0379 if match:
0380 if int(match.group(1)) == 1:
0381 pedeLogWrn = 1
0382 pedeLogWrnStr += line
0383 elif int(match.group(1)) != 0:
0384 pedeLogErr = 1
0385 pedeLogErrStr += line
0386 if logZipped == 'true':
0387 os.system('gzip -f '+eazeLog)
0388 else:
0389 print('mps_check.py cannot find',eazeLog,'to test')
0390
0391
0392
0393 farmhost = ' '
0394
0395 okStatus = 'OK'
0396 if badInputFile == 1:
0397 print(lib.JOBDIR[i],lib.JOBID[i],'had FileOpenError')
0398 okStatus = 'FAIL'
0399 remark = 'FileOpenError'
0400 if not eofile == 1:
0401 print(lib.JOBDIR[i],lib.JOBID[i],'did not reach end of file')
0402 okStatus = 'ABEND'
0403 if quotaspace == 1:
0404 print(lib.JOBDIR[i],lib.JOBID[i],'had quota space problem')
0405 okStatus = 'FAIL'
0406 remark = 'eos quota space problem'
0407 if ioprob == 1:
0408 print(lib.JOBDIR[i],lib.JOBID[i],'had I/O problem')
0409 okStatus = 'FAIL'
0410 if fw8001 == 1:
0411 print(lib.JOBDIR[i],lib.JOBID[i],'had Framework error 8001 problem')
0412 remark = 'fwk error 8001'
0413 okStatus = 'FAIL'
0414 if timeout == 1:
0415 print(lib.JOBDIR[i],lib.JOBID[i],'had connection timed out problem')
0416 remark = 'connection timed out'
0417 if cfgerr == 1:
0418 print(lib.JOBDIR[i],lib.JOBID[i],'had config file error')
0419 remark = 'cfg file error'
0420 okStatus = 'FAIL'
0421 if killed == 1:
0422 guess = " (probably time exceeded)" if kill_reason is None else ":"
0423 print(lib.JOBDIR[i], lib.JOBID[i], "Job killed" + guess)
0424 if kill_reason is not None: print(kill_reason)
0425 remark = "killed";
0426 okStatus = "FAIL"
0427 if timel == 1:
0428 print(lib.JOBDIR[i],lib.JOBID[i],'ran into time limit')
0429 okStatus = 'TIMEL'
0430 if tooManyTracks == 1:
0431 print(lib.JOBDIR[i],lib.JOBID[i],'too many tracks')
0432 if segviol == 1:
0433 print(lib.JOBDIR[i],lib.JOBID[i],'SEGVIOL encountered')
0434 remark = 'seg viol'
0435 okStatus = 'FAIL'
0436 if rfioerr == 1:
0437 print(lib.JOBDIR[i],lib.JOBID[i],'RFIO error encountered')
0438 remark = 'rfio error'
0439 okStatus = 'FAIL'
0440 if quota == 1:
0441 print(lib.JOBDIR[i],lib.JOBID[i],'Request exceeds quota')
0442 if exceptionCaught == 1:
0443 print(lib.JOBDIR[i],lib.JOBID[i],'Exception caught in cmsrun')
0444 remark = 'Exception caught'
0445 okStatus = 'FAIL'
0446 if emptyDatErr == 1:
0447 print('milleBinary???.dat file not found or empty')
0448 remark = 'empty milleBinary'
0449 if emptyDatOnFarm > 0:
0450 print('...but already empty on farm so OK (or check job',i+1,'yourself...)')
0451 else:
0452 okStatus = 'FAIL'
0453 if cmdNotFound == 1:
0454 print(lib.JOBDIR[i],lib.JOBID[i],'Command not found')
0455 remark = 'cmd not found'
0456 okStatus = 'FAIL'
0457 if insuffPriv == 1:
0458 print(lib.JOBDIR[i],lib.JOBID[i],'Insufficient privileges to rfcp files')
0459 remark = 'Could not rfcp files'
0460 okStatus = 'FAIL'
0461 if pedeAbend == 1:
0462 print(lib.JOBDIR[i],lib.JOBID[i],'Pede did not end normally')
0463 remark = 'pede failed'
0464 okStatus = 'FAIL'
0465 if pedeLogErr == 1:
0466 print(lib.JOBDIR[i],lib.JOBID[i],'Problems in running Pede:')
0467 print(pedeLogErrStr)
0468 remark = 'pede error'
0469 okStatus = 'FAIL'
0470 if pedeLogWrn == 1:
0471
0472 print(lib.JOBDIR[i],lib.JOBID[i],'Warnings in running Pede:')
0473 print(pedeLogWrnStr)
0474 remark = 'pede warnings'
0475 okStatus = 'WARN'
0476 if endofjob != 1:
0477 print(lib.JOBDIR[i],lib.JOBID[i],'Job not ended')
0478 remark = 'job not ended'
0479 okStatus = 'FAIL'
0480 if copyerr == 1 and ispede!=1:
0481
0482 print(lib.JOBDIR[i],lib.JOBID[i],'Copy to eos failed')
0483 remark = 'copy to eos failed'
0484 okStatus = 'FAIL'
0485
0486 if missingBinaryfile == 1:
0487 print(lib.JOBDIR[i],lib.JOBID[i],'Missing binary file')
0488 remark = 'Missing binary file'
0489 okStatus = 'FAIL'
0490 if missingTreefile == 1:
0491 print(lib.JOBDIR[i],lib.JOBID[i],'Missing Treefile')
0492 remark = 'Missing Treefile'
0493 okStatus = 'FAIL'
0494 if missingMonitor == 1:
0495 print(lib.JOBDIR[i],lib.JOBID[i],'Missing Monitor')
0496 remark = 'Missing Monitor file'
0497 okStatus = 'FAIL'
0498 if zeroEvents == 1:
0499 print(lib.JOBDIR[i],lib.JOBID[i],'Did not process any events')
0500 remark = 'Did not process any events'
0501 okStatus = 'FAIL'
0502 if mismatchedInputFiles == 1:
0503 print(lib.JOBDIR[i],lib.JOBID[i],'MismatchedInputFiles')
0504 remark = 'MismatchedInputFiles'
0505 okStatus = 'FAIL'
0506
0507 if okStatus != "OK":
0508 print(lib.JOBDIR[i],lib.JOBID[i],' -------- ',okStatus)
0509
0510
0511 lib.JOBNEVT[i] = nEvent
0512
0513 lib.JOBSTATUS[i] = disabled+okStatus
0514
0515 lib.JOBRUNTIME[i] = cputime
0516
0517 lib.JOBREMARK[i] = remark
0518
0519
0520
0521 lib.write_db()
0522