from __future__ import print_function
from __future__ import absolute_import

from builtins import input, range

import bisect
import datetime
import json
import os
import re
import sys

import Utilities.General.cmssw_das_client as das_client
from FWCore.PythonUtilities.LumiList import LumiList

from .helperFunctions import cache
from .TkAlExceptions import AllInOneError

class Dataset(object):
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"],
                  magneticfield = None, dasinstance = None):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__dasinstance = dasinstance
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None

        # an official dataset name has the form /PrimaryDataset/ProcessedDataset/DataTier
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print(msg)
                print("Getting the data from DAS again. To go faster next time, run scram b.")
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        if magneticfield is not None:
            try:
                magneticfield = float(magneticfield)
            except ValueError:
                raise AllInOneError("Bad magneticfield {} which can't be converted to float".format(magneticfield))
        self.__inputMagneticField = magneticfield

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()

    def __chunks( self, theList, n ):
        """Yield successive n-sized chunks from theList."""
        for i in range( 0, len( theList ), n ):
            yield theList[i:i+n]

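    # A minimal sketch of how __chunks behaves, with hypothetical values:
    #   list(self.__chunks([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]
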
    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames ="
                         "secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(int(%(nEvents)s)) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(int(%(nEvents)s)) )\n"
                               "%(skipEventsString)s\n")
    def __lumiSelectionSnippet( self, jsonPath = None, firstRun = None, lastRun = None ):
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                    if not (splitLumiList and splitLumiList[0] and splitLumiList[0][0]):
                        splitLumiList = None
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a json file, but it seems to be a CMSSW lumi selection cff snippet. Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print(msg)

                            runlist = self.__getRunList()
                            if firstRun or lastRun:
                                self.__firstusedrun = -1
                                self.__lastusedrun = -1
                                jsoncontents = re.sub(r"\d+:(\d+|max)(-\d+:(\d+|max))?", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                                jsoncontents = (jsoncontents.replace("'',\n","").replace("''\n","")
                                                            .replace('"",\n','').replace('""\n',''))
                                self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                                if self.__lastusedrun < self.__firstusedrun:
                                    jsoncontents = None
                            else:
                                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                            lumiSecExtend = jsoncontents
                            splitLumiList = None
                        else:
                            raise AllInOneError("%s is not a valid json file!" % jsonPath)

            if splitLumiList and splitLumiList[0] and splitLumiList[0][0]:
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
                runlist = self.__getRunList()
                self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
            elif lumiSecExtend:
                pass
            else:
                msg = "You are trying to run a validation without any runs! Check that:"
                if firstRun or lastRun:
                    msg += "\n - firstRun/begin and lastRun/end are correct for this dataset, and there are runs in between containing data"
                if jsonPath:
                    msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                if (firstRun or lastRun) and jsonPath:
                    msg += "\n - firstRun/begin and lastRun/end are consistent with your JSON file"
                raise AllInOneError(msg)

        else:
            if self.__inputMagneticField is not None:
                pass  # the run range is only needed to look up the magnetic field, which was given explicitly
            else:
                runlist = self.__getRunList()
                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))

        return lumiSecExtend

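    # The snippet returned above is a series of chunked extend calls, e.g. with
    # hypothetical runs:
    #   lumiSecs.extend( [
    #   '190456:1-190456:max',
    #   '190457:1-190457:max'
    #   ] )
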
    def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
            if not splitFileList:
                raise AllInOneError("No files found for dataset {}. Check the spelling, or maybe specify another das instance?".format(self.__name))
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent=True, firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        return files

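    # The generated block has the same chunked-extend shape, e.g. with hypothetical
    # filenames:
    #   readFiles.extend( [
    #   '/store/data/.../file1.root',
    #   '/store/data/.../file2.root'
    #   ] )
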
    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):

        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun').")
            raise AllInOneError( msg )

        lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun)
        lumiStr = goodLumiSecStr = ""
        if lumiSecExtend:
            goodLumiSecStr = "lumiSecs = cms.untracked.VLuminosityBlockRange()\n"
            lumiStr = " lumisToProcess = lumiSecs,\n"

        files = self.__fileListSnippet(crab=crab, parent=parent, firstRun=firstRun, lastRun=lastRun, forcerunselection=False)

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        'Return the index of the rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        'Return the index of the leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError

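    # Both helpers assume a sorted list; e.g., with hypothetical values:
    #   self.__find_lt([1, 3, 5, 7], 6)  ->  2   (index of 5, the rightmost value < 6)
    #   self.__find_ge([1, 3, 5, 7], 6)  ->  3   (index of 7, the leftmost value >= 6)
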
    def __findInJson(self, jsondict, strings):
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict, dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError):  # TypeError too, since a could be a string that contains strings[0]
                        pass

        raise KeyError("Can't find " + strings[0])

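    # Example of the nested lookup, on a hypothetical DAS-like record:
    #   self.__findInJson({"run": {"run_number": 207898}}, ["run", "run_number"])
    #   ->  207898
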
    def forcerunrange(self, firstRun, lastRun, s):
        """s is a regex match whose group must be in the format run1:lum1-run2:lum2"""
        s = s.group()
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        try:
            run2 = s.split("-")[1].split(":")[0]
            lum2 = s.split("-")[1].split(":")[1]
        except IndexError:
            run2 = run1
            lum2 = lum1
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
            self.__firstusedrun = int(run1)
        if int(run2) > self.__lastusedrun:
            self.__lastusedrun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction

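    # The returned closure is meant as the replacement function of re.sub; e.g.,
    # with hypothetical run limits:
    #   re.sub(r"\d+:(\d+|max)(-\d+:(\d+|max))?",
    #          self.getForceRunRangeFunction(190456, 191000), jsoncontents)
    # clamps every "run1:lum1-run2:lum2" token in jsoncontents to the given range.
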
    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data(dasQuery, dasLimit)
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData

        # check if the DAS query succeeded
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            try:
                jsonstr = self.__findInJson(jsondict,"reason")
            except KeyError:
                jsonstr = str(jsondict)
            if len(jsonstr) > 10000:
                jsonfile = "das_query_output_%i.txt"
                i = 0
                while os.path.lexists(jsonfile % i):
                    i += 1
                jsonfile = jsonfile % i
                with open( jsonfile, "w" ) as theFile:
                    theFile.write( jsonstr )
                msg = "The DAS query returned an error. The output is very long, and has been stored in:\n" + jsonfile
            else:
                msg = "The DAS query returned an error. Here is the output\n" + jsonstr
            msg += "\nIt's possible that this was a server error. If so, it may work if you try again later"
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                if datatype is not None:
                    return datatype
            return "unknown"

        dasQuery_type = ( 'dataset dataset=%s instance=%s detail=true | grep dataset.datatype,'
                          'dataset.name'%( self.__name, self.__dasinstance ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print ("Cannot find the datatype of the dataset '%s'\n"
                   "It may not be possible to automatically find the magnetic field,\n"
                   "and you will not be able to run in CRAB mode"
                   %( self.name() ))
            return "unknown"

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name + " instance="+self.__dasinstance
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error. If so, it may work if you try again later")

    def __getMagneticField( self ):
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) )  # longest names first, e.g. MagneticField_38T_PostLS1 before MagneticField_38T

        if self.__inputMagneticField is not None:
            if self.__inputMagneticField == 3.8:
                return "MagneticField"
            elif self.__inputMagneticField == 0:
                return "MagneticField_0T"
            else:
                raise ValueError("Unknown input magnetic field {}".format(self.__inputMagneticField))

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    else:
                        print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                        print("Using Bfield='unknown' - this will revert to the default")
                        return "unknown"
                elif datatype == "data":
                    return "MagneticField"  # should be in the "#magnetic field" line, but fall back for safety
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        dasQuery_B = ('dataset dataset=%s instance=%s'%(self.__name, self.__dasinstance))
        data = self.__getData( dasQuery_B )

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                pass
            else:
                print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                print("Using Bfield='unknown' - this will revert to the default magnetic field")
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
                    and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                # last attempt: try to identify the field from the dataset name;
                # "TkAlCosmics0T" is removed first so it cannot match MagneticField_0T
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as the previous function.
        For data, it gets the magnetic field from the runs. This is important for
        deciding which template to use for offlinevalidation
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8  # MC with the default MagneticField runs at 3.8T
        if self.__inputMagneticField is not None:
            return self.__inputMagneticField
        if "T" in self.__magneticField:
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0  # e.g. 38T -> 3.8
            except ValueError:
                pass
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run=%s instance=%s detail=true'%(run, self.__dasinstance))
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        # run < 0: find the field for the first and last used runs,
        # and make sure they agree to within the tolerance
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print(firstrunB, lastrunB, tolerance)
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB

    @cache
    def __getFileInfoList( self, dasLimit, parent = False ):
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        # strip quotes, commas, and whitespace around the filename
                        files.append({"name": re.sub("['\", \n]", "", line)})
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s instance=%s detail=true | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset, self.__dasinstance ) )
        print("Requesting file information for '%s' from DAS..."%( searchdataset ), end=' ')
        sys.stdout.flush()
        data = self.__getData( dasQuery_files, dasLimit )
        print("Done.")
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'. This can be "
                   "due to a typo or due to a DAS problem. Please check the "
                   "spelling of the dataset and/or retry to run "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print(("DAS query gives bad output for file '%s'. Skipping it.\n"
                       "It may work if you try again later.") % fileName)
                fileNEvents = 0
            # skip empty files (and files with a malformed DAS record)
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                       }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        return fileInformationList

    @cache
    def __getRunList( self ):
        dasQuery_runs = ( 'run dataset=%s instance=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name, self.__dasinstance ) )
        print("Requesting run information for '%s' from DAS..."%( self.__name ), end=' ')
        sys.stdout.flush()
        data = self.__getData( dasQuery_runs )
        print("Done.")
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                                + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)

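    # These two helpers are inverses of each other, e.g.:
    #   self.__datetime("20121128")                     ->  datetime.date(2012, 11, 28)
    #   self.__dateString(datetime.date(2012, 11, 28))  ->  "20121128"
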
    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]:  # try date windows of increasing size
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'. "
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]:  # try date windows of increasing size
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'. "
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun

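    # Example with hypothetical arguments:
    #   convertTimeToRun(begin="20121120", end="20121128") queries DAS for runs in a
    #   growing date window after 'begin' (and before 'end') and translates both
    #   endpoints into the (firstRun, lastRun) pair of actual run numbers.
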
    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
        if not firstRun: firstRun = None
        if not lastRun: lastRun = None
        if not begin: begin = None
        if not end: end = None
        if self.__predefined and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print(msg)
                        print ("Retrieving the files from DAS. You will be asked if you want "
                               "to overwrite the old dataset.\n"
                               "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       "    input = cms.untracked.int32(int(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.))\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))\n",
                   "importCms": "",
                   "header": ""
                 }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if not (jsonPath or begin or end or firstRun or lastRun):
            # no selection given: store the full dataset as a predefined cff for future use
            try:
                self.dump_cff(parent = parent)
            except AllInOneError as e:
                print("Can't store the dataset as a cff:")
                print(e)
                print("This may be inconvenient in the future, but will not cause a problem for this validation.")
        return datasetSnippet

    @cache
    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
        if outName is None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n"
                             %{"name": self.__name,
                               "dataType": self.__dataType}
                 }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "MagneticField":
            magneticField = "%s, %s #%s" % (magneticField,
                                            str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                            "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                           )
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print()
        with open( filePath, "w" ) as theFile:
            theFile.write( dataset_cff )
        return

    def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun):
        with open(filename, "w") as f:
            for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob):
                f.write(",".join("'{}'".format(file) for file in job)+"\n")

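    # Each line of the output file holds the quoted input files for one HipPy job,
    # e.g. with hypothetical filenames:
    #   '/store/.../file1.root','/store/.../file2.root','/store/.../file3.root'
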
    @staticmethod
    def getrunnumberfromfilename(filename):
        parts = filename.split("/")
        result = error = None
        if parts[0] != "" or parts[1] != "store":
            error = "does not start with /store"
        elif parts[2] in ["mc", "relval"]:
            result = 1
        elif not parts[-1].endswith(".root"):
            error = "does not end with something.root"
        elif len(parts) != 12:
            error = "should be exactly 11 slashes counting the first one"
        else:
            runnumberparts = parts[-5:-2]
            if not all(len(part)==3 for part in runnumberparts):
                error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))
            try:
                result = int("".join(runnumberparts))
            except ValueError:
                error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))

        if error:
            error = "could not figure out which run number this file is from:\n{}\n{}".format(filename, error)
            raise AllInOneError(error)

        return result

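    # Example with a hypothetical LFN:
    #   Dataset.getrunnumberfromfilename(
    #       "/store/data/Run2012D/MinimumBias/ALCARECO/TkAlMinBias-v1/000/207/898/00000/file.root")
    #   ->  207898   (the three 3-digit directories joined and cast to int)
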
    @cache
    def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
        fileList = [ self.__findInJson(fileInfo,"name")
                     for fileInfo in self.fileInfoList(parent) ]

        if firstRun or lastRun:
            if not firstRun: firstRun = -1
            if not lastRun: lastRun = float('infinity')
            unknownfilenames, reasons = [], set()
            for filename in fileList[:]:
                try:
                    if not firstRun <= self.getrunnumberfromfilename(filename) <= lastRun:
                        fileList.remove(filename)
                except AllInOneError as e:
                    if forcerunselection: raise
                    unknownfilenames.append(e.message.split("\n")[1])
                    reasons.add(e.message.split("\n")[2])
            if reasons:
                if len(unknownfilenames) == len(fileList):
                    print("Could not figure out the run numbers of any of the filenames for the following reason(s):")
                else:
                    print("Could not figure out the run numbers of the following filenames:")
                    for filename in unknownfilenames:
                        print(" "+filename)
                    print("for the following reason(s):")
                for reason in reasons:
                    print(" "+reason)
                print("Using the files anyway. The runs will be filtered at the CMSSW level.")
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    @cache
    def runList( self ):
        return self.__getRunList()

if __name__ == '__main__':
    print("Start testing...")
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print(dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128"))
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )