File indexing completed on 2024-11-25 02:29:11
0001 import os
0002
def getName(log):
    """Build the identifier of a job from the lines of its STDOUT log.

    The identifier has the form "<c> <a> <b> AAG = <True|False>", where
    <a>, <b> and <c> are the first three whitespace-separated tokens left on
    the "Processing files" line once the words "Processing files ", "to" and
    "of run" are removed (presumably first file, last file and run number --
    confirm against the job's log format).

    Parameters:
        log: the log as a list of lines.

    Returns:
        The identifier, or a message starting with "Unable to get name" when
        the log has fewer than 10 lines, has no "/store/express/Run2017"
        line, or has no usable "Processing files" line after its first 15
        lines.  Callers recognise a failure by the word "Unable".
    """
    if len(log) < 10:
        return "Unable to get name (log too short)"

    # Every matching line is examined, so the last one decides the flag
    Aag = None
    for line in log:
        if "/store/express/Run2017" in line:
            Aag = "AfterAbortGap" in line
    if Aag is None:
        return "Unable to get name (no AAG info found)"

    # Only the first "Processing files" line past the 15-line header counts
    for line in log[15:]:
        if "Processing files" in line:
            runId = (line.replace("Processing files ", "")
                         .replace("to", "")
                         .replace("of run", "")
                         .split())
            break
    else:
        return "Unable to get name (run numbers not found)"

    # A truncated line must yield the usual failure message, not an IndexError
    if len(runId) < 3:
        return "Unable to get name (run numbers not found)"
    return "%s %s %s AAG = %s" % (runId[2], runId[0], runId[1], Aag)
0021
def checkRelaunch(log):
    """Tell whether a job is already recorded in its relaunch list.

    Parameters:
        log: job identifier of the form "<run spec> AAG = <True|False>".
            The flag selects the list: "FailledRun.txt" for "False",
            "FailledRun_Aag.txt" for "True" (both in the current directory).

    Returns:
        1 if the run spec is a substring of some line of the list,
        0 if it is not (also when the list file does not exist yet),
        -1 if the identifier carries no recognisable flag.
    """
    parts = log.split(" AAG = ")
    if len(parts) < 2:
        # No separator at all: same outcome as an unknown flag value
        return -1
    if parts[1] == "False":
        relaunchFile = "FailledRun.txt"
    elif parts[1] == "True":
        relaunchFile = "FailledRun_Aag.txt"
    else:
        return -1

    # The list only comes into existence with the first relaunch request,
    # so a missing file means nothing has been recorded so far.
    if not os.path.isfile(relaunchFile):
        return 0

    with open(relaunchFile, "r") as f:
        for line in f:
            if parts[0] in line:
                return 1
    return 0
0040
def relaunchShort(folder):
    """Process a job folder using its LSFJOB command file instead of its log.

    The command is the text following "job_starter" on the first line of
    <folder>/LSFJOB that contains that word.  Three options are read from it:
    "-r" (run), "--firstFile" (first file index) and "-f" (comma-separated
    list; its number of commas is added to the first index to get the last
    one).  The resulting identifier is looked up with checkRelaunch(): a job
    not yet listed is handed to relaunch(), a job already listed has its
    folder deleted with remove().

    Parameters:
        folder: path of the job folder.

    Returns:
        -1 when LSFJOB is missing, holds no "job_starter" line, or the
        command lacks a usable "-r", "--firstFile" or "-f" option;
        None otherwise.
    """
    if "LSFJOB" not in os.listdir(folder):
        print("Unable to open command file")
        return -1

    cmd = ""
    with open(folder + "/LSFJOB", "r") as f:
        for line in f:
            if "job_starter" in line:
                cmd = line.split("job_starter")[1]
                break
    if cmd == "":
        print("Unable to get command...")
        return -1

    AAG = "SiStripCalMinBiasAfterAbortGap" in cmd
    args = cmd.split(" ")

    run = None
    firstFile = None
    nExtraFiles = None
    # Pairing each token with its successor means an option that is the very
    # last token simply has no value instead of indexing past the end.
    for option, value in zip(args, args[1:]):
        if option == "--firstFile":
            firstFile = value.replace("'", "").replace("\n", "")
        elif option == "-r":
            run = value.replace("'", "").replace("\n", "")
        elif option == "-f":
            nExtraFiles = len(value.split(",")) - 1

    if run is None or firstFile is None or nExtraFiles is None:
        print("Unable to get run specs from command")
        return -1
    try:
        lastFile = int(firstFile) + nExtraFiles
    except ValueError:
        print("Unable to get run specs from command (bad --firstFile value)")
        return -1

    print("Found run specs : %s %s %s (AAG : %s)" % (run, firstFile, lastFile, AAG))
    name = "%s %s %s AAG = %s" % (run, firstFile, lastFile, AAG)

    status = checkRelaunch(name)
    if status == 0:
        print("Run not found in relaunch!!!")
        relaunch(folder, name)
    elif status == 1:
        print("file in relaunch list")
        remove(folder)
    else:
        print("Unable to check if file in relaunch")
0074
def remove(folder):
    """Delete *folder* and everything below it with "rm -r".

    Parameters:
        folder: path to delete.

    The path is passed to rm as a single argument, without going through a
    shell, so names containing spaces or shell metacharacters are handled.
    "deleted !" is printed whether or not rm succeeded; rm itself reports
    any failure on stderr.
    """
    import subprocess

    # "--" ends option parsing, so a name starting with "-" is still a path
    subprocess.call(["rm", "-r", "--", folder])
    print("deleted !")
0078
def relaunch(folder, log):
    """Record a job in its relaunch list, then delete its folder.

    Parameters:
        folder: path of the job folder, passed to remove() once the job is
            recorded.
        log: job identifier of the form "<run spec> AAG = <True|False>".
            The run spec is appended as one line to "FailledRun.txt" when
            the flag is "False" and to "FailledRun_Aag.txt" when it is
            "True" (both in the current directory).

    Returns:
        -1 when the identifier carries no recognisable flag (nothing is
        written or deleted in that case); None otherwise.
    """
    parts = log.split(" AAG = ")
    runType = parts[1] if len(parts) > 1 else None
    if runType == "False":
        relaunchFile = "FailledRun.txt"
    elif runType == "True":
        relaunchFile = "FailledRun_Aag.txt"
    else:
        print("ERROR, unable to get run type")
        return -1

    # Written directly rather than through a shell "echo": the run spec
    # comes from log files and must never be interpreted as shell syntax.
    with open(relaunchFile, "a") as f:
        f.write(parts[0] + "\n")
    print("Added to relaunch list (%s)." % relaunchFile)
    remove(folder)
0091
def getCollection(log):
    """Name the collection mentioned by the first relevant line of a log.

    Parameters:
        log: iterable of log lines.

    Returns:
        "Aag" if the first line mentioning a collection contains
        "ALCARECOSiStripCalMinBiasAfterAbortGap", "Std" if it only contains
        "ALCARECOSiStripCalMinBias", and the string "None" when no line
        mentions either.
    """
    for line in log:
        # The longer name must be tested first: the standard name is a
        # prefix of it and would otherwise match every AfterAbortGap line.
        if "ALCARECOSiStripCalMinBiasAfterAbortGap" in line:
            return "Aag"
        if "ALCARECOSiStripCalMinBias" in line:
            return "Std"
    return "None"
0099
0100
0101
# First pass over the current directory: delete core dumps and sort the job
# folders by the tail of their STDOUT log.
NoError = []  # "stageout" appears in the last 2000 characters of the log
Error = []    # it does not
for folder in os.listdir("."):
    if folder.startswith("core."):
        print("removing %s" % folder)
        try:
            os.remove(folder)
        except OSError as err:
            # Best effort: report the failure and carry on with the scan
            print("Unable to remove %s (%s)" % (folder, err))
    # Job folders are the directories whose name starts with "LSF"
    if not folder.startswith("LSF") or not os.path.isdir(folder):
        continue
    if "STDOUT" not in os.listdir(folder):
        print("Error, no STDOUT file in folder %s !" % folder)
        continue
    # The context manager closes the log right away; many folders may be
    # scanned in one run.
    with open(folder + "/STDOUT", "r") as stdout:
        log = stdout.read()

    if "stageout" in log[-2000:]:
        NoError.append(folder)
    else:
        Error.append(folder)
0119
if not NoError and not Error:
    print("Nothing to do...")
ToKeep = []  # folders that are neither deleted nor relaunched

# Second pass: jobs whose log tail mentions "stageout".
for f in NoError:
    with open(f + "/STDOUT", "r") as stdout:
        log = stdout.read()
    if "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in log:
        # The warning may sit anywhere in the log; treat the job as failed
        Error.append(f)
    elif "The file size is" in log[-2000:]:
        print("Removing good run %s - %s" % (f, getName(log.split("\n"))))
        remove(f)
    else:
        print("Something fishy in %s (marked as good)" % f)
        ToKeep.append(f)
0134
# Third pass: failed jobs.  The last 80 lines of each log (the whole log when
# it is shorter) are searched, from the end backwards, for a known error
# signature.  Jobs with a signature that allows it are recorded for relaunch
# and deleted; the others are kept on disk.
for f in Error:
    with open(f + "/STDOUT", "r") as stdout:
        log = stdout.read().split("\n")

    if len(log) < 80:
        print("Short in %s (%s)" % (f, len(log)))
        # NOTE(review): relaunchShort() may already delete the folder; the
        # scan below still runs on the lines read above, as it always did.
        relaunchShort(f)

    eMessage = ""
    toRemove = False
    # reversed(log[-80:]) starts at the LAST line.  Indexing with log[-i]
    # for i = 0, 1, ... would read the first line of the log at i == 0 and
    # never reach the 80th line from the end.
    for line in reversed(log[-80:]):
        if "Disk quota exceeded" in line:
            eMessage = "Disk quota exceeded"
            toRemove = True
            break
        elif "cms.untracked.vstring('ProductNotFound')" in line:
            eMessage = "Product not found"
            toRemove = True
            break
        elif "Can not interpret the query (while creating DASQuery)" in line:
            eMessage = "DAS query not understood"
            toRemove = False
            break
        elif "client timeout after" in line:
            # NOTE(review): no break here, so an earlier line of the log can
            # still override this verdict -- confirm this is intended.
            eMessage = "DAS timeout"
            toRemove = False
        elif "No such file or directory" in line:
            eMessage = "Can't open input file"
            toRemove = True
            break
        elif "Network dropped connection on reset" in line:
            eMessage = "Network dropped connection on reset"
            toRemove = True
            break
        elif "Job Failed with ExitCode" in line:
            # The line itself is the message
            eMessage = line
            toRemove = True
            break
        elif "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in line:
            eMessage = "Bad stageout status."
            toRemove = True
            break

    if eMessage != "" and toRemove:
        name = getName(log)
        if "Unable" in name:
            # Without an identifier the job can be neither recorded nor
            # looked up; it is left as it is.
            print("%s - %s" % (f, name))
            continue
        print("Removing bad run %s - %s (%s)" % (f, eMessage, name))
        code = checkRelaunch(name)
        if code == -1:
            print("ABORT, unable to get relaunch status")
        elif code == 0:
            relaunch(f, name)
        else:
            print("Found in relaunch list... deleting...")
            remove(f)
    else:
        print("Something fishy in %s" % f)
        ToKeep.append(f)
0195
# Final report: one line per folder that was left on disk.
if ToKeep:
    print("Strange jobs :")
    print("\n".join(ToKeep))
0198 for i in ToKeep: print(i)