Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 14:14:47

0001 from __future__ import print_function
0002 import os
0003 
def getName(log):
   """Build the job name "<run> <first> <last> AAG = <True|False>" from a log.

   log is the job output as a list of lines.

   The AAG flag comes from the last line containing "/store/express/Run2017":
   it is True when that line also contains "AfterAbortGap".  The three run
   fields come from the first "Processing files" line found after the first
   15 lines of the log.

   Returns the name, or a message starting with "Unable to get name" when the
   log is shorter than 10 lines or the information is missing.
   """
   if len(log)<10:
      return ("Unable to get name (log too short)")

   Aag = None
   for line in log:
      if "/store/express/Run2017" in line:
         Aag = "AfterAbortGap" in line
   if Aag is None:
      return ("Unable to get name (no AAG info found)")

   runId = []
   for line in log[15:]:
      if "Processing files" in line:
         # Strip the fixed words so that only the numeric fields remain
         # (presumably "Processing files A to B of run R" -- confirm against a real log).
         runId = line.replace("Processing files ","").replace("to","").replace("of run","").split()
         break
   # A missing or truncated line gives fewer than three fields; indexing it
   # below would raise IndexError instead of reporting the problem.
   if len(runId)<3:
      return("Unable to get name (run numbers not found)")
   return("%s %s %s AAG = %s"%(runId[2],runId[0],runId[1],Aag))
def checkRelaunch(log):
   """Tell whether a job is already listed in the relaunch file.

   log is a job name of the form "<run specs> AAG = <True|False>".  The part
   after " AAG = " selects the list: "FailledRun.txt" for "False",
   "FailledRun_Aag.txt" for "True" (both looked up in the current directory).

   Returns 1 when a line of the list contains the run specs, 0 when none
   does (or when the list file does not exist yet), and -1 when the AAG
   flag cannot be read from log.
   """
   parts = log.split(" AAG = ")
   if len(parts)<2:
      # no " AAG = " separator: the run type is unknown
      return (-1)
   if parts[1]=="False":
      relaunchFile = "FailledRun.txt"
   elif parts[1]=="True":
      relaunchFile = "FailledRun_Aag.txt"
   else:
      return (-1)

   # The list only exists once a first job has been appended to it;
   # until then nothing can have been relaunched.
   if not os.path.isfile(relaunchFile):
      return(0)

   with open(relaunchFile,"r") as f:
      for line in f:
         # NOTE(review): substring match, so "300000 1 5" also matches a
         # listed "300000 1 50" -- confirm whether that is acceptable.
         if parts[0] in line:
            return(1)
   return(0)
0041       
def _parseRunSpecs(cmd):
   """Return (run, firstFile, nRuns) read from the -r, --firstFile and -f options of cmd, or None if one is missing."""
   run = firstFile = nRuns = None
   tokens = cmd.split(" ")
   # Pair every token with its successor; an option that is the very last
   # token has no value and is simply ignored.
   for option,value in zip(tokens,tokens[1:]):
      if option=="--firstFile":
         firstFile = value.replace("'",'').replace("\n","")
      if option=="-r":
         run = value.replace("'",'').replace("\n","")
      if option=="-f":
         # number of comma-separated entries, minus one
         nRuns = len(value.split(","))-1
   if run is None or firstFile is None or nRuns is None:
      return None
   return (run,firstFile,nRuns)

def relaunchShort(folder):
   """Handle a job folder whose log is too short to be analysed.

   The job command is read from folder/LSFJOB: the text following
   "job_starter" on the first line that contains it.  The run, first file and
   file count are taken from its -r, --firstFile and -f options, and the job
   counts as AAG when the command contains "SiStripCalMinBiasAfterAbortGap".

   The resulting name is looked up with checkRelaunch: when not listed the
   job is added through relaunch() (which also deletes the folder), when
   already listed the folder is just deleted.

   Returns -1 when the command or its run specs cannot be read, None otherwise.
   """
   if not "LSFJOB" in os.listdir(folder):
      print("Unable to open command file")
      return -1
   cmd=""
   with open(folder+"/LSFJOB","r") as f:
      for line in f:
         if "job_starter" in line:
            cmd=line.split("job_starter")[1]
            break
   if cmd=="":
      print("Unable to get command...")
      return -1
   AAG = "SiStripCalMinBiasAfterAbortGap" in cmd

   specs = _parseRunSpecs(cmd)
   if specs is None:
      # one of -r / --firstFile / -f is absent from the command
      print("Unable to get run specs from command...")
      return -1
   run,firstFile,nRuns = specs
   try:
      lastFile = int(firstFile)+nRuns
   except ValueError:
      print("Unable to get run specs from command (bad first file %s)"%firstFile)
      return -1

   print("Found run specs : %s %s %s (AAG : %s)"%(run,firstFile,lastFile,AAG))
   name = "%s %s %s AAG = %s"%(run,firstFile,lastFile,AAG)
   x = checkRelaunch(name)
   if x==0:
      print("Run not found in relaunch!!!")
      relaunch(folder,name)
   elif x==1:
      print("file in relaunch list")
      remove(folder)
   else:
      print("Unable to check if file in relaunch")
0075 
def remove(folder):
   """Delete folder and everything below it.

   The deletion is done in-process rather than through a shell command, so
   names containing spaces or shell metacharacters are handled correctly.
   Prints "deleted !" on success and an error message when the path cannot
   be deleted.  Returns None in both cases.
   """
   import shutil  # local import: only this function needs it
   try:
      if os.path.isdir(folder) and not os.path.islink(folder):
         shutil.rmtree(folder)
      else:
         # plain file or symbolic link: remove the entry itself
         os.remove(folder)
   except OSError as e:
      print("Unable to delete %s (%s)"%(folder,e))
      return
   print("deleted !")
0079 
def relaunch(folder,log):
   """Append a job to the relaunch list, then delete its folder.

   log is a job name of the form "<run specs> AAG = <True|False>".  The run
   specs are appended as one line to "FailledRun.txt" ("False") or
   "FailledRun_Aag.txt" ("True") in the current directory, after which
   folder is deleted with remove().

   Returns -1 (and touches nothing) when the AAG flag cannot be read from
   log, None otherwise.
   """
   parts = log.split(" AAG = ")
   if len(parts)>1 and parts[1]=="False":
      relaunchFile = "FailledRun.txt"
   elif len(parts)>1 and parts[1]=="True":
      relaunchFile = "FailledRun_Aag.txt"
   else:
      print("ERROR, unable to get run type")
      return (-1)
   # Written directly instead of through a shell "echo >>", so characters
   # special to the shell cannot alter the command.  split/join keeps the
   # single-space separated line that echo used to produce.
   with open(relaunchFile,"a") as f:
      f.write(" ".join(parts[0].split())+"\n")
   print("Added to relaunch list (%s)."%relaunchFile)
   remove(folder)
0092 
def getCollection(log):
   """Return the collection named in a log: "Aag", "Std" or "None".

   log is an iterable of lines.  The first line naming a collection decides:
   "Aag" for "ALCARECOSiStripCalMinBiasAfterAbortGap", "Std" for
   "ALCARECOSiStripCalMinBias".  The string "None" is returned when no line
   names either.
   """
   for x in log:
      # The standard name is a prefix of the AfterAbortGap one, so the longer
      # name has to be tested first or it could never be reported.
      if "ALCARECOSiStripCalMinBiasAfterAbortGap" in x:
         return("Aag")
      if "ALCARECOSiStripCalMinBias" in x:
         return("Std")
   return ("None")
0100 
0101 
0102 
# Sort the LSF* job folders of the current directory into NoError / Error,
# depending on whether "stageout" shows up in the last 2000 characters of
# their STDOUT file.  Entries whose name starts with "core." are deleted on
# the way.
NoError = []
Error = []
for folder in os.listdir("."):
   if "core." in folder[:5]:
      print("removing %s"%folder)
      try:
         os.remove(folder)
      except OSError as e:
         print("Unable to remove %s (%s)"%(folder,e))
      # a core.* entry is never an LSF job folder
      continue
   if not folder[:3]=="LSF" or not os.path.isdir(folder):
      continue
   if not "STDOUT" in os.listdir(folder):
      print("Error, no STDOUT file in folder %s !"%folder)
      continue
   # context manager: the handle is closed as soon as the log is read
   with open(folder+"/STDOUT","r") as logFile:
      log = logFile.read()

   if "stageout" in log[-2000:]:
      NoError.append(folder)
   else:
      Error.append(folder)

if len(NoError)+len(Error)==0:
   print("Nothing to do...")
# Folders that could not be classified; they are listed at the end of the run.
ToKeep = []

# Jobs that reached the stageout step: delete the ones that reported a file
# size, move failed stageouts to the Error list, keep anything else.
for f in NoError:
   # context manager: the handle is closed as soon as the log is read
   with open(f+"/STDOUT","r") as logFile:
      log = logFile.read()
   if "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in log:
      Error.append(f)
   elif "The file size is" in log[-2000:]:
      print("Removing good run %s - %s"%(f,getName(log.split("\n"))))
      remove(f)
   else:
      print("Something fishy in %s (marked as good)"%f)
      ToKeep.append(f)
0135 
# Failed jobs: look for a known error message near the end of the log, then
# either put the run in the relaunch list (deleting its folder) or keep the
# folder for manual inspection.
for f in Error:
   toRemove=False
   # context manager: the handle is closed as soon as the log is read
   with open(f+"/STDOUT","r") as logFile:
      log = logFile.read().split("\n")
   eMessage=""
   if len(log)<80:
      print("Short in %s (%s)"%(f,len(log)))
      relaunchShort(f)
      if not os.path.isdir(f):
         # relaunchShort already dealt with the folder and deleted it;
         # going on would act on (and report) a folder that is gone.
         continue
   # Walk the last 80 lines (the whole log when it is shorter), starting
   # from the very last one.
   for line in reversed(log[-80:]):
      if "Disk quota exceeded" in line:
         eMessage = "Disk quota exceeded"
         toRemove=True
         break
      elif "cms.untracked.vstring('ProductNotFound')" in line:
         eMessage = "Product not found"
         toRemove = True
         break
      elif "Can not interpret the query (while creating DASQuery)" in line:
         eMessage = "DAS query not understood"
         toRemove=False
         break
      elif "client timeout after" in line:
         # No break here: scanning goes on, so an earlier line may still
         # override this verdict.  NOTE(review): confirm this is intended.
         eMessage = "DAS timeout"
         toRemove=False
      elif "No such file or directory" in line:
         eMessage = "Can't open input file"
         toRemove=True
         break
      elif "Network dropped connection on reset" in line:
         eMessage = "Network dropped connection on reset"
         toRemove=True
         break
      elif "Job Failed with ExitCode" in line:
         eMessage= line
         toRemove=True
         break
      elif "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in line:
         eMessage= "Bad stageout status."
         toRemove=True
         break
   if not eMessage == "" and toRemove:
      name = getName(log)
      if "Unable" in name:
         # getName failed: report it and leave the folder untouched
         print("%s - %s"%(f,name))
         continue
      print("Removing bad run %s - %s (%s)"%(f,eMessage,name))
      code = checkRelaunch(name)
      if code==-1:
         print("ABORT, unable to get relaunch status")
      elif code==0:
         relaunch(f,name)
      else:
         print("Found in relaunch list... deleting...")
         remove(f)
   else:
      print("Something fishy in %s"%f)
      ToKeep.append(f)
0196 
# Final report: name every folder that was left in place for manual inspection.
if ToKeep:
   print("Strange jobs :")
   for strangeJob in ToKeep:
      print(strangeJob)