File indexing completed on 2023-03-17 10:44:25
0001 from __future__ import print_function
0002 import os
0003
def getName(log):
    """Build the identifier string of a job from the lines of its STDOUT.

    log is the job output already split into a list of lines.

    Returns "<t3> <t1> <t2> AAG = <True|False>", where t1..t3 are the three
    tokens left on the "Processing files" line once its fixed words are
    stripped, and the flag tells whether an input file path mentions
    "AfterAbortGap". When the information cannot be extracted, a message
    starting with "Unable to get name" is returned instead (callers test for
    the word "Unable").
    """
    if len(log) < 10:
        return "Unable to get name (log too short)"

    # No early exit here: the last matching input-file line decides the flag.
    Aag = None
    for line in log:
        if "/store/express/Run2017" in line:
            Aag = "AfterAbortGap" in line
    if Aag is None:
        return "Unable to get name (no AAG info found)"

    # The run specification is only searched after the first 15 lines.
    runId = None
    for line in log[15:]:
        if "Processing files" in line:
            stripped = line.replace("Processing files ", "")
            stripped = stripped.replace("to", "").replace("of run", "")
            runId = stripped.split()
            break

    # A missing or truncated line must yield the "Unable" message rather than
    # an IndexError on the token lookup below.
    if not runId or len(runId) < 3:
        return "Unable to get name (run numbers not found)"
    return "%s %s %s AAG = %s" % (runId[2], runId[0], runId[1], Aag)
0022
def checkRelaunch(log):
    """Tell whether a job is already present in its relaunch list.

    log is a job name of the form "<run spec> AAG = <True|False>". The flag
    selects the list: "FailledRun.txt" for "False", "FailledRun_Aag.txt" for
    "True".

    Returns 1 when some line of the list contains the run spec, 0 when none
    does (or when the list file does not exist yet), and -1 when the flag
    cannot be read from the name.
    """
    parts = log.split(" AAG = ")
    if len(parts) < 2:
        # No separator at all: same outcome as an unrecognised flag.
        return -1

    if parts[1] == "False":
        relaunchFile = "FailledRun.txt"
    elif parts[1] == "True":
        relaunchFile = "FailledRun_Aag.txt"
    else:
        return -1

    # A list that was never written cannot contain the run.
    if not os.path.isfile(relaunchFile):
        return 0

    with open(relaunchFile, "r") as f:
        for line in f:
            if parts[0] in line:
                return 1
    return 0
0041
def relaunchShort(folder):
    """Queue a job for relaunch using the command stored in its LSFJOB file.

    The command is whatever follows "job_starter" on the first line of
    <folder>/LSFJOB containing that word. The run comes from the "-r" option,
    the first file index from "--firstFile", and the number of files from the
    comma-separated "-f" list (item count minus one). The job is then added to
    the relaunch list if it is not there yet, and its folder is deleted.

    Returns -1 when the command or its options cannot be read; otherwise
    returns None.
    """
    if "LSFJOB" not in os.listdir(folder):
        print("Unable to open command file")
        return -1

    cmd = ""
    with open(folder + "/LSFJOB", "r") as f:
        for line in f:
            if "job_starter" in line:
                cmd = line.split("job_starter")[1]
                break
    if cmd == "":
        print("Unable to get command...")
        return -1

    AAG = "SiStripCalMinBiasAfterAbortGap" in cmd

    # Pair every token with its successor so a flag sitting at the very end of
    # the command is ignored instead of indexing past the list.
    tokens = cmd.split(" ")
    firstFile = run = nRuns = None
    for flag, value in zip(tokens, tokens[1:]):
        if flag == "--firstFile":
            firstFile = value.replace("'", "").replace("\n", "")
        if flag == "-r":
            run = value.replace("'", "").replace("\n", "")
        if flag == "-f":
            nRuns = len(value.split(",")) - 1

    if firstFile is None or run is None or nRuns is None:
        print("Unable to get run specs from command")
        return -1
    try:
        lastFile = int(firstFile) + nRuns
    except ValueError:
        print("Unable to get run specs from command")
        return -1

    print("Found run specs : %s %s %s (AAG : %s)" % (run, firstFile, lastFile, AAG))
    name = "%s %s %s AAG = %s" % (run, firstFile, lastFile, AAG)
    x = checkRelaunch(name)
    if x == 0:
        print("Run not found in relaunch!!!")
        relaunch(folder, name)
    elif x == 1:
        print("file in relaunch list")
        remove(folder)
    else:
        print("Unable to check if file in relaunch")
0075
def remove(folder):
    """Recursively delete folder with "rm -r" and report it on stdout.

    The path is handed to rm as a single argument, without going through a
    shell, so names containing spaces or shell metacharacters are deleted
    correctly. As before, "deleted !" is printed whatever rm's exit status.
    """
    # Function-scope import: the file's import block only provides os.
    import subprocess

    # "--" keeps a name starting with "-" from being read as an option.
    subprocess.call(["rm", "-r", "--", folder])
    print("deleted !")
0079
def relaunch(folder, log):
    """Append a job to its relaunch list, then delete the job folder.

    log is a job name of the form "<run spec> AAG = <True|False>". The run
    spec is written as one line at the end of "FailledRun.txt" (flag "False")
    or "FailledRun_Aag.txt" (flag "True").

    Returns -1, leaving the folder and the lists untouched, when the flag
    cannot be read from the name; otherwise returns None.
    """
    parts = log.split(" AAG = ")
    runType = parts[1] if len(parts) > 1 else None
    if runType == "False":
        relaunchFile = "FailledRun.txt"
    elif runType == "True":
        relaunchFile = "FailledRun_Aag.txt"
    else:
        print("ERROR, unable to get run type")
        return -1

    # Append directly rather than through a shell "echo >>", so the text is
    # written verbatim; mode "a" creates the list on first use.
    with open(relaunchFile, "a") as f:
        f.write(parts[0] + "\n")
    print("Added to relaunch list (%s)." % relaunchFile)
    remove(folder)
0092
def getCollection(log):
    """Return which ALCARECO collection the log lines mention.

    log is an iterable of lines. The first line holding a collection name
    decides the result: "Aag" for ALCARECOSiStripCalMinBiasAfterAbortGap,
    "Std" for plain ALCARECOSiStripCalMinBias, and the string "None" when no
    line mentions either.
    """
    for x in log:
        # The standard name is a prefix of the after-abort-gap name, so the
        # longer one has to be tested first or it could never be reported.
        if "ALCARECOSiStripCalMinBiasAfterAbortGap" in x:
            return "Aag"
        if "ALCARECOSiStripCalMinBias" in x:
            return "Std"
    return "None"
0100
0101
0102
# Sort the LSF job folders of the current directory into two lists, depending
# on whether the tail of their STDOUT mentions "stageout". Core dumps found
# along the way are deleted.
NoError = []
Error = []
for folder in os.listdir("."):
    if folder.startswith("core."):
        print("removing %s" % folder)
        # Delete without going through a shell; a failure is reported and the
        # scan goes on, as it did when rm failed.
        try:
            os.remove(folder)
        except OSError as err:
            print(err)
    if not folder[:3] == "LSF" or not os.path.isdir(folder):
        continue
    if "STDOUT" not in os.listdir(folder):
        print("Error, no STDOUT file in folder %s !" % folder)
        continue
    # Close the file as soon as it is read instead of leaking the handle.
    with open(folder + "/STDOUT", "r") as stdout:
        log = stdout.read()

    # Only the last 2000 characters are inspected.
    if "stageout" in log[-2000:]:
        NoError.append(folder)
    else:
        Error.append(folder)
0120
# Nothing was classified: say so (the loops below then have no work to do).
if not NoError and not Error:
    print("Nothing to do...")
ToKeep = []

# Re-examine the jobs whose log tail mentioned "stageout": a stage-out failure
# warning moves the job to the Error list, a reported file size means the
# output is there and the folder can go, anything else is kept for inspection.
for f in NoError:
    # Close the file as soon as it is read instead of leaking the handle.
    with open(f + "/STDOUT", "r") as stdout:
        log = stdout.read()
    if "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in log:
        Error.append(f)
    elif "The file size is" in log[-2000:]:
        print("Removing good run %s - %s" % (f, getName(log.split("\n"))))
        remove(f)
    else:
        print("Something fishy in %s (marked as good)" % f)
        ToKeep.append(f)
0135
# Handle the failed jobs: look for a known error message near the end of the
# log, then either queue the job for relaunch and delete its folder, or keep
# the folder for manual inspection.
for f in Error:
    toRemove = False
    # Close the file as soon as it is read instead of leaking the handle.
    with open(f + "/STDOUT", "r") as stdout:
        log = stdout.read()
    log = log.split("\n")
    eMessage = ""
    if len(log) < 80:
        print("Short in %s (%s)" % (f, len(log)))
        # NOTE(review): relaunchShort may already delete the folder, yet the
        # in-memory log is still scanned below - confirm this is intended.
        relaunchShort(f)
        logLen = len(log)
    else:
        logLen = 80

    # Walk the last logLen lines from the end of the log backwards. Indexing
    # with log[-i] from i == 0 looked at the FIRST line of the log first and
    # never reached the logLen-th line from the end.
    for line in reversed(log[-logLen:]):
        if "Disk quota exceeded" in line:
            eMessage = "Disk quota exceeded"
            toRemove = True
            break
        elif "cms.untracked.vstring('ProductNotFound')" in line:
            eMessage = "Product not found"
            toRemove = True
            break
        elif "Can not interpret the query (while creating DASQuery)" in line:
            eMessage = "DAS query not understood"
            toRemove = False
            break
        elif "client timeout after" in line:
            # No break: an error found further up the log may still override
            # this diagnosis.
            eMessage = "DAS timeout"
            toRemove = False
        elif "No such file or directory" in line:
            eMessage = "Can't open input file"
            toRemove = True
            break
        elif "Network dropped connection on reset" in line:
            eMessage = "Network dropped connection on reset"
            toRemove = True
            break
        elif "Job Failed with ExitCode" in line:
            eMessage = line
            toRemove = True
            break
        elif "WARNING WARNING WARNING STAGE OUT FAILED BUT NOT RELAUNCHED" in line:
            eMessage = "Bad stageout status."
            toRemove = True
            break

    if not eMessage == "" and toRemove:
        name = getName(log)
        if "Unable" in name:
            # Without a name the job cannot be queued: leave the folder alone.
            print("%s - %s" % (f, name))
            continue
        print("Removing bad run %s - %s (%s)" % (f, eMessage, name))
        code = checkRelaunch(name)
        if code == -1:
            print("ABORT, unable to get relaunch status")
        elif code == 0:
            relaunch(f, name)
        else:
            print("Found in relaunch list... deleting...")
            remove(f)
    else:
        print("Something fishy in %s" % f)
        ToKeep.append(f)
0196
# Final report: list, one per line, the job folders that were left in place.
if ToKeep:
    print("Strange jobs :")
    print("\n".join(ToKeep))