Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-25 02:29:11

0001 import os
0002 
def getName(log):
   """Build the name identifying a job's run from the lines of its STDOUT.

   Args:
      log: list of the lines of the job's STDOUT.

   Returns:
      "<c> <a> <b> AAG = <True|False>", where a, b, c are the three tokens
      left on the first "Processing files" line found after the first 15
      lines once "Processing files ", "to" and "of run" are stripped
      (presumably "Processing files <first> to <last> of run <run>").
      The AAG flag tells whether the last line mentioning
      "/store/express/Run2017" also mentions "AfterAbortGap".
      On failure a message starting with "Unable to get name" is returned
      instead; callers detect it by looking for "Unable".
   """
   if len(log) < 10:
      return "Unable to get name (log too short)"

   # The last matching line wins, as every match overwrites the flag.
   Aag = None
   for line in log:
      if "/store/express/Run2017" in line:
         Aag = "AfterAbortGap" in line
   if Aag is None:
      return "Unable to get name (no AAG info found)"

   runId = []
   for line in log[15:]:
      if "Processing files" in line:
         runId = line.replace("Processing files ", "").replace("to", "").replace("of run", "").split()
         break
   # A truncated "Processing files" line yields fewer than three tokens;
   # report it like a missing line instead of failing on the indexing below.
   if len(runId) < 3:
      return "Unable to get name (run numbers not found)"
   return "%s %s %s AAG = %s" % (runId[2], runId[0], runId[1], Aag)
def checkRelaunch(log):
   """Tell whether a run is already recorded in its relaunch list.

   Args:
      log: run name of the form "<run spec> AAG = <True|False>".

   Returns:
      1 if <run spec> appears in a line of the relaunch file
      ("FailledRun.txt" for AAG = False, "FailledRun_Aag.txt" for AAG = True,
      both looked up in the current directory), 0 if it does not, and -1 if
      the name carries no usable AAG flag.
   """
   parts = log.split(" AAG = ")
   if len(parts) < 2:
      # No " AAG = " separator at all: same outcome as an unknown flag value.
      return -1
   if parts[1] == "False":
      relaunchFile = "FailledRun.txt"
   elif parts[1] == "True":
      relaunchFile = "FailledRun_Aag.txt"
   else:
      return -1

   # The list is only created when a first run gets appended to it, so a
   # missing file simply means that nothing has been registered yet.
   if not os.path.isfile(relaunchFile):
      return 0

   with open(relaunchFile, "r") as f:
      for line in f:
         # NOTE: substring match, so "1 0 1" is also found in "1 0 10".
         if parts[0] in line:
            return 1
   return 0
0040       
def relaunchShort(folder):
   """Handle a job with a short log using the command stored in its LSFJOB file.

   The text following "job_starter" on the first such line of
   <folder>/LSFJOB is split on spaces and the values following "-r",
   "--firstFile" and "-f" are used to rebuild the run name
   "<run> <firstFile> <firstFile + number of commas in -f> AAG = <flag>",
   the flag being True when the command mentions
   "SiStripCalMinBiasAfterAbortGap".  The run is then added to the relaunch
   list (relaunch) if it is not in it yet, or the folder is just deleted
   (remove) if it already is.

   Args:
      folder: path of the job folder.

   Returns:
      -1 when the command cannot be read or parsed, None otherwise.
   """
   if "LSFJOB" not in os.listdir(folder):
      print("Unable to open command file")
      return -1

   cmd = ""
   with open(folder + "/LSFJOB", "r") as f:
      for line in f:
         if "job_starter" in line:
            cmd = line.split("job_starter")[1]
            break
   if cmd == "":
      print("Unable to get command...")
      return -1

   AAG = "SiStripCalMinBiasAfterAbortGap" in cmd
   run = firstFile = nRuns = None
   args = cmd.split(" ")
   # Pair every token with the one following it: a flag sitting at the very
   # end of the command has no value and is therefore never matched.
   for flag, value in zip(args, args[1:]):
      if flag == "--firstFile":
         firstFile = value.replace("'", "").replace("\n", "")
      elif flag == "-r":
         run = value.replace("'", "").replace("\n", "")
      elif flag == "-f":
         nRuns = len(value.split(",")) - 1

   if run is None or firstFile is None or nRuns is None:
      print("Unable to get run specs from command")
      return -1
   try:
      lastFile = int(firstFile) + nRuns
   except ValueError:
      print("Unable to get run specs from command (bad first file %s)" % firstFile)
      return -1

   print("Found run specs : %s %s %s (AAG : %s)" % (run, firstFile, lastFile, AAG))
   name = "%s %s %s AAG = %s" % (run, firstFile, lastFile, AAG)
   x = checkRelaunch(name)
   if x == 0:
      print("Run not found in relaunch!!!")
      relaunch(folder, name)
   elif x == 1:
      print("file in relaunch list")
      remove(folder)
   else:
      print("Unable to check if file in relaunch")
0074 
def remove(folder):
   """Delete a job folder together with everything it contains.

   The deletion is done from Python rather than through a shell command, so
   names containing spaces or shell metacharacters are handled literally.
   A failure is reported on stdout and does not raise.

   Args:
      folder: path to delete; a plain file or a symbolic link is unlinked.
   """
   import shutil  # local import: the module header is not part of this block

   try:
      if os.path.isdir(folder) and not os.path.islink(folder):
         shutil.rmtree(folder)
      else:
         os.remove(folder)
   except OSError as err:
      print("Unable to delete %s (%s)" % (folder, err))
      return
   print("deleted !")
0078 
def relaunch(folder,log):
   """Append a run to the relaunch list matching its type, then delete its folder.

   Args:
      folder: path of the job folder, deleted once the run is registered.
      log: run name of the form "<run spec> AAG = <True|False>".

   Returns:
      -1 when the run type cannot be read from the name (nothing is written
      nor deleted in that case), None otherwise.
   """
   parts = log.split(" AAG = ")
   # A name without the separator has no type either: report it the same way.
   if len(parts) < 2 or parts[1] not in ("False", "True"):
      print("ERROR, unable to get run type")
      return -1
   if parts[1] == "False":
      relaunchFile = "FailledRun.txt"
   else:
      relaunchFile = "FailledRun_Aag.txt"

   # Append directly to the file: the run spec is written as is, without
   # being interpreted by a shell.
   with open(relaunchFile, "a") as f:
      f.write(parts[0] + "\n")
   print("Added to relaunch list (%s)."%relaunchFile)
   remove(folder)
0091 
def getCollection(log):
   """Return the collection named by the first log line that mentions one.

   Args:
      log: iterable of log lines.

   Returns:
      "Aag" if that line contains "ALCARECOSiStripCalMinBiasAfterAbortGap",
      "Std" if it contains "ALCARECOSiStripCalMinBias" only, and the string
      "None" when no line mentions either.
   """
   for x in log:
      # The standard name is a prefix of the after-abort-gap one, so the
      # longer name must be tested first to be reachable at all.
      if "ALCARECOSiStripCalMinBiasAfterAbortGap" in x:
         return "Aag"
      if "ALCARECOSiStripCalMinBias" in x:
         return "Std"
   return "None"
0099 
0100 
0101 
# Sort the LSF job folders of the current directory by the look of their
# STDOUT: "stageout" within the last 2000 characters means the job went
# through, anything else is treated as a failure.
NoError = []
Error = []
for folder in os.listdir("."):
   if folder.startswith("core."):
      print("removing %s"%folder)
      try:
         os.remove(folder)
      except OSError as err:
         print("Unable to remove %s (%s)"%(folder,err))
   if not folder.startswith("LSF") or not os.path.isdir(folder):
      continue
   if "STDOUT" not in os.listdir(folder):
      print("Error, no STDOUT file in folder %s !"%folder)
      continue
   with open(folder+"/STDOUT","r") as stdout:
      log = stdout.read()

   if "stageout" in log[-2000:]:
      NoError.append(folder)
   else:
      Error.append(folder)

if len(NoError)+len(Error)==0:
   print("Nothing to do...")
ToKeep = []

# Jobs that look fine: delete them, unless the stage out is flagged as failed
# (handled with the errors below) or the file size report is missing (kept).
for f in NoError:
   with open(f+"/STDOUT","r") as stdout:
      log = stdout.read()
   if "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in log:
      Error.append(f)
   elif "The file size is" in log[-2000:]:
      print("Removing good run %s - %s"%(f,getName(log.split("\n"))))
      remove(f)
   else:
      print("Something fishy in %s (marked as good)"%f)
      ToKeep.append(f)

# Failed jobs: look for a known error in the last lines of the log and, for
# the errors allowing it, register the run for relaunch and delete the folder.
for f in Error:
   toRemove=False
   with open(f+"/STDOUT","r") as stdout:
      log = stdout.read().split("\n")
   eMessage=""
   if len(log)<80:
      print("Short in %s (%s)"%(f,len(log)))
      relaunchShort(f)
      if not os.path.isdir(f):
         # relaunchShort dealt with the job and deleted its folder: scanning
         # further could register the run a second time and would list a
         # folder that no longer exists among the strange jobs.
         continue
      logLen=len(log)
   else:
      logLen=80
   # Walk the log backwards from its last line: log[-1], log[-2], ...
   # (starting the index at 0 would read the FIRST line instead).
   for i in range(1,logLen+1):
      line = log[-i]
      if "Disk quota exceeded" in line:
         eMessage = "Disk quota exceeded"
         toRemove=True
         break
      elif "cms.untracked.vstring('ProductNotFound')" in line:
         eMessage = "Product not found"
         toRemove = True
         break
      elif "Can not interpret the query (while creating DASQuery)" in line:
         eMessage = "DAS query not understood"
         toRemove=False
         break
      elif "client timeout after" in line:
         # NOTE(review): unlike every other case the scan goes on here, so a
         # known error on an earlier line overrides this one - confirm that
         # this is intended.
         eMessage = "DAS timeout"
         toRemove=False
      elif "No such file or directory" in line:
         eMessage = "Can't open input file"
         toRemove=True
         break
      elif "Network dropped connection on reset" in line:
         eMessage = "Network dropped connection on reset"
         toRemove=True
         break
      elif "Job Failed with ExitCode" in line:
         eMessage= line
         toRemove=True
         break
      elif "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in line:
         eMessage= "Bad stageout status."
         toRemove=True
         break
   if not eMessage == "" and toRemove:
      name = getName(log)
      if "Unable" in name:
         print("%s - %s"%(f,name))
         continue
      print("Removing bad run %s - %s (%s)"%(f,eMessage,name))
      code = checkRelaunch(name)
      if code==-1:
         print("ABORT, unable to get relaunch status")
      elif code==0:
         relaunch(f,name)
      else:
         print("Found in relaunch list... deleting...")
         remove(f)
   else:
      print("Something fishy in %s"%f)
      ToKeep.append(f)

if ToKeep:
   print("Strange jobs :")
   for i in ToKeep:
      print(i)