from __future__ import print_function
from __future__ import absolute_import
# idea stolen from:
# http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
#        PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
from builtins import input
from builtins import range
import bisect
import datetime
import json
import os
import re
import sys

import Utilities.General.cmssw_das_client as das_client
from FWCore.PythonUtilities.LumiList import LumiList

from .helperFunctions import cache
from .TkAlExceptions import AllInOneError

class Dataset(object):
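    """Dataset description for the tracker alignment all-in-one validation.

    Wraps a CMS dataset, given either as an official DAS dataset name of the
    form /PrimaryDataset/ProcessedDataset/Tier or as a predefined _cff.py file
    in Alignment/OfflineValidation, and provides run, file, and magnetic-field
    information plus cms.Source configuration snippets.
    """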
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"],
                  magneticfield = None, dasinstance = None):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__dasinstance = dasinstance
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None

        # check if the dataset name matches the CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print(msg)
                print("Getting the data from DAS again.  To go faster next time, run scram b.")
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        if magneticfield is not None:
            try:
                magneticfield = float(magneticfield)
            except ValueError:
                raise AllInOneError("Bad magneticfield {}, which can't be converted to float".format(magneticfield))
        self.__inputMagneticField = magneticfield

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()


    def __chunks( self, theList, n ):
        """ Yield successive n-sized chunks from theList.
        """
        for i in range( 0, len( theList ), n ):
            yield theList[i:i+n]

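    # Templates for the generated cms.Source configuration.  The %(...)s
    # placeholders are filled in __createSnippet from the repMap dictionary
    # that datasetSnippet and dump_cff build: header, importCms,
    # goodLumiSecStr, process, lumiStr, tab, files, lumiSecExtend, nEvents,
    # and skipEventsString.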
    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s                    secondaryFileNames ="
                         "secFiles,\n"
                         "%(tab)s                    fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(int(%(nEvents)s)) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s                    secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s                    fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(int(%(nEvents)s)) )\n"
                               "%(skipEventsString)s\n")

    def __lumiSelectionSnippet( self, jsonPath = None, firstRun = None, lastRun = None ):
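        """Build the lumi-selection part of the source snippet.

        Returns a string of lumiSecs.extend(...) statements (or, if jsonPath
        is really a lumi-selection cff snippet, its adjusted contents)
        restricted to jsonPath, firstRun, and lastRun; returns "" if no
        selection is requested.  As a side effect, records the first and
        last used runs.
        """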
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-"
                             + str( self.__findInJson(run, "run_number") ) + ":max"
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                    if not (splitLumiList and splitLumiList[0] and splitLumiList[0][0]):
                        splitLumiList = None
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a json file, but it seems to be a CMSSW lumi selection cff snippet.  Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n  (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print(msg)

                            runlist = self.__getRunList()
                            if firstRun or lastRun:
                                self.__firstusedrun = -1
                                self.__lastusedrun = -1
                                jsoncontents = re.sub(r"\d+:(\d+|max)(-\d+:(\d+|max))?", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                                jsoncontents = (jsoncontents.replace("'',\n","").replace("''\n","")
                                                            .replace('"",\n','').replace('""\n',''))
                                self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                                if self.__lastusedrun < self.__firstusedrun:
                                    jsoncontents = None
                            else:
                                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                            lumiSecExtend = jsoncontents
                            splitLumiList = None
                        else:
                            raise AllInOneError("%s is not a valid json file!" % jsonPath)

            if splitLumiList and splitLumiList[0] and splitLumiList[0][0]:
                lumiSecStr = [ "',\n'".join( lumis )
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )"
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
                runlist = self.__getRunList()
                self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
            elif lumiSecExtend:
                pass
            else:
                msg = "You are trying to run a validation without any runs!  Check that:"
                if firstRun or lastRun:
                    msg += "\n - firstRun/begin and lastRun/end are correct for this dataset, and there are runs in between containing data"
                if jsonPath:
                    msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                if (firstRun or lastRun) and jsonPath:
                    msg += "\n - firstRun/begin and lastRun/end are consistent with your JSON file"
                raise AllInOneError(msg)

        else:
            if self.__inputMagneticField is not None:
                pass  # never need self.__firstusedrun or self.__lastusedrun
            else:
                runlist = self.__getRunList()
                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))

        return lumiSecExtend

    def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
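        """Build the readFiles.extend(...) (and, with parent=True, the
        secFiles.extend(...)) statements for the source snippet.  With
        crab=True an empty string is returned, because the crab snippet
        uses the dummy source template with a placeholder file instead.
        """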
        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
            if not splitFileList:
                raise AllInOneError("No files found for dataset {}.  Check the spelling, or maybe specify another das instance?".format(self.__name))
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )"
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent=True, firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )"
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        return files

    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):
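        """Create the full cms.Source configuration snippet for this dataset,
        converting begin/end dates to runs if necessary and filling the
        source template (or, with crab=True, the dummy source template)
        from repMap."""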

        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                         ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun').")
            raise AllInOneError( msg )

        lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun)
        lumiStr = goodLumiSecStr = ""
        if lumiSecExtend:
            goodLumiSecStr = "lumiSecs = cms.untracked.VLuminosityBlockRange()\n"
            lumiStr = "                    lumisToProcess = lumiSecs,\n"

        files = self.__fileListSnippet(crab=crab, parent=parent, firstRun=firstRun, lastRun=lastRun, forcerunselection=False)

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        """Find the rightmost value less than x."""
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        """Find the leftmost item greater than or equal to x."""
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError

    def __findInJson(self, jsondict, strings):
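        """Recursively look up a key path in DAS JSON output.

        strings may be a single key or a list of keys; jsondict may be a
        dict or a list of dicts, in which case each element is searched in
        turn.  Returns the first match, or raises KeyError if the path is
        not found."""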
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict,dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError):  # TypeError because a could be a string and contain strings[0]
                        pass
        # if it's not found
        raise KeyError("Can't find " + strings[0])

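    # forcerunrange is used, via getForceRunRangeFunction, as the callback of
    # an re.sub over a lumi-selection cff snippet (see __lumiSelectionSnippet):
    # every run range in the text is clipped to [firstRun, lastRun], and
    # ranges that lie entirely outside are dropped.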
    def forcerunrange(self, firstRun, lastRun, s):
        """s is a regex match whose text has the format run1:lum1-run2:lum2
        (or a single run:lum)."""
        # guard against a missing bound: only one of firstRun/lastRun may be given
        if firstRun is None: firstRun = 1
        if lastRun is None: lastRun = float("inf")
        s = s.group()
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        try:
            run2 = s.split("-")[1].split(":")[0]
            lum2 = s.split("-")[1].split(":")[1]
        except IndexError:
            run2 = run1
            lum2 = lum1
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
            self.__firstusedrun = int(run1)
        if int(run2) > self.__lastusedrun:
            self.__lastusedrun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction

    def __getData( self, dasQuery, dasLimit = 0 ):
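        """Run dasQuery through cmssw_das_client and return the 'data' field
        of the JSON reply; raise AllInOneError if the query failed."""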
        dasData = das_client.get_data(dasQuery, dasLimit)
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData
        # check if the DAS query failed
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            try:
                jsonstr = self.__findInJson(jsondict,"reason")
            except KeyError:
                jsonstr = str(jsondict)
            if len(jsonstr) > 10000:
                jsonfile = "das_query_output_%i.txt"
                i = 0
                while os.path.lexists(jsonfile % i):
                    i += 1
                jsonfile = jsonfile % i
                with open( jsonfile, "w" ) as theFile:
                    theFile.write( jsonstr )
                msg = "The DAS query returned an error.  The output is very long, and has been stored in:\n" + jsonfile
            else:
                msg = "The DAS query returned an error.  Here is the output\n" + jsonstr
            msg += "\nIt's possible that this was a server error.  If so, it may work if you try again later."
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        return datatype
                return "unknown"

        dasQuery_type = ( 'dataset dataset=%s instance=%s detail=true | grep dataset.datatype,'
                          'dataset.name'%( self.__name, self.__dasinstance ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print("Cannot find the datatype of the dataset '%s'\n"
                  "It may not be possible to automatically find the magnetic field,\n"
                  "and you will not be able to run in CRAB mode"
                  %( self.name() ))
            return "unknown"

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name + " instance=" + self.__dasinstance
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error.  If so, it may work if you try again later.")

    def __getMagneticField( self ):
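        """Determine which MagneticField_*_cff configuration matches this
        dataset, trying in order: the explicitly given magneticfield, the
        header of the predefined _cff.py file, the data type, the mcm record
        in DAS, and finally the dataset name; returns "unknown" if nothing
        matches."""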
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'')
                           for f in os.listdir(Bfieldlocation)
                               if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) )  # put in order of decreasing length, so that searching in the name gives the longest match

        if self.__inputMagneticField is not None:
            if self.__inputMagneticField == 3.8:
                return "MagneticField"
            elif self.__inputMagneticField == 0:
                return "MagneticField_0T"
            else:
                raise ValueError("Unknown input magnetic field {}".format(self.__inputMagneticField))

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    else:
                        print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                        print("Using Bfield='unknown' - this will revert to the default")
                        return "unknown"
                elif datatype == "data":
                    return "MagneticField"           # this should be in the "#magnetic field" line, but for safety in case it got messed up
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        # try to find the magnetic field from DAS
        # it seems to be there for the newer (7X) MC samples, except cosmics
        dasQuery_B = ('dataset dataset=%s instance=%s'%(self.__name, self.__dasinstance))
        data = self.__getData( dasQuery_B )

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                pass
            else:
                print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                print("Using Bfield='unknown' - this will revert to the default magnetic field")
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
              and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                # final attempt - try to identify the dataset from the name
                # all cosmics dataset names contain "TkAlCosmics0T"
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as __getMagneticField().
           For data, it gets the magnetic field from the runs.  This is important for
           deciding which template to use for the offline validation.
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8                                        # for 3.8T MC the default MagneticField is used
        if self.__inputMagneticField is not None:
            return self.__inputMagneticField
        if "T" in self.__magneticField:
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0                   # e.g. 38T and 38T_PostLS1 both return 3.8
            except ValueError:
                pass
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run=%s instance=%s detail=true'%(run, self.__dasinstance))   # for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        # run < 0 - find the B field for the first and last runs, and make sure they're compatible
        #   (to within tolerance)
        # NOT FOOLPROOF!  The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return 0.5 * (firstrunB + lastrunB)
            print(firstrunB, lastrunB, tolerance)
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB

    @cache
    def __getFileInfoList( self, dasLimit, parent = False ):
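        """Return one {"name": ...} dict per file of the dataset (or of its
        parent, with parent=True).  For non-predefined datasets each dict
        also carries "creation_time" and "nevents", empty files are skipped,
        and the list is sorted by file name."""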
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        # strip the quotes, comma, and whitespace around the file name
                        files.append({"name": line.strip().strip("'\",")})
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s instance=%s detail=true | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset, self.__dasinstance ) )
        print("Requesting file information for '%s' from DAS..."%( searchdataset ), end=' ')
        sys.stdout.flush()
        data = self.__getData( dasQuery_files, dasLimit )
        print("Done.")
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'. This can be "
                   "due to a typo or due to a DAS problem. Please check the "
                   "spelling of the dataset and/or retry running "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print(("DAS query gives bad output for file '%s'.  Skipping it.\n"
                       "It may work if you try again later.") % fileName)
                fileNEvents = 0
            # select only non-empty files
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                         }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        return fileInformationList

    @cache
    def __getRunList( self ):
        dasQuery_runs = ( 'run dataset=%s instance=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name, self.__dasinstance ) )
        print("Requesting run information for '%s' from DAS..."%( self.__name ), end=' ')
        sys.stdout.flush()
        data = self.__getData( dasQuery_runs )
        print("Done.")
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                              + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)

    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
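        """Convert 'begin' and/or 'end' dates ('yyyymmdd' strings) into run
        numbers by querying DAS for runs created near those dates.  Returns
        (firstRun, lastRun) if shortTuple is True, otherwise
        (begin, end, firstRun, lastRun)."""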
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                         ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]:                       # try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'.  "
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]:                       # try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'.  "
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun

    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
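        """Return the python configuration snippet that sets up the source
        for this dataset: a process.load(...) block for predefined datasets,
        otherwise a full source snippet built from DAS information via
        __createSnippet."""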
        if not firstRun: firstRun = None
        if not lastRun: lastRun = None
        if not begin: begin = None
        if not end: end = None
        if self.__predefined and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print(msg)
                        print("Retrieving the files from DAS.  You will be asked if you want "
                              "to overwrite the old dataset.\n"
                              "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       "    input = cms.untracked.int32(int(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.))\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))\n",
                   "importCms": "",
                   "header": ""
                   }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if jsonPath is None and begin is None and end is None and firstRun is None and lastRun is None:
            try:
                self.dump_cff(parent = parent)
            except AllInOneError as e:
                print("Can't store the dataset as a cff:")
                print(e)
                print("This may be inconvenient in the future, but will not cause a problem for this validation.")
        return datasetSnippet

    @cache
    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
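        """Store the dataset as a predefined _cff.py file in the user's
        Alignment/OfflineValidation package, asking before overwriting an
        existing file.  The header comments encode the dataset name, data
        type, and magnetic field for later parsing."""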
        if outName is None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' into your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n"    # put in the magnetic field later
                             %{"name": self.__name,                        # need to create the snippet before getting the magnetic field
                               "dataType": self.__dataType}                # so that we know the first and last runs
                   }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "MagneticField":
            magneticField = "%s, %s     #%s" % (magneticField,
                                                str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                                "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                               )
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print( "The predefined dataset '%s' will be stored in the file\n"
               %( outName )
               + filePath +
               "\nFor future use you have to run 'scram b'." )
        print()
        with open( filePath, "w" ) as theFile:
            theFile.write( dataset_cff )
        return

    def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun):
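        """Write the file list to filename for HipPy jobs: one line per job,
        each a comma-separated list of quoted file names, filesperjob files
        per line."""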
        with open(filename, "w") as f:
            for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob):
                f.write(",".join("'{}'".format(file) for file in job)+"\n")

    @staticmethod
    def getrunnumberfromfilename(filename):
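        """Deduce the run number from an official file name.  The three
        directories before the last one encode the run number, e.g. a file
        under .../000/207/800/00000/ is from run 207800 (path schematic);
        MC and relval files return 1."""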
        parts = filename.split("/")
        result = error = None
        if parts[0] != "" or parts[1] != "store":
            error = "does not start with /store"
        elif parts[2] in ["mc", "relval"]:
            result = 1
        elif not parts[-1].endswith(".root"):
            error = "does not end with something.root"
        elif len(parts) != 12:
            error = "there should be exactly 11 slashes, counting the leading one"
        else:
            runnumberparts = parts[-5:-2]
            if not all(len(part)==3 for part in runnumberparts):
                error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))
            try:
                result = int("".join(runnumberparts))
            except ValueError:
                error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))

        if error:
            error = "could not figure out which run number this file is from:\n{}\n{}".format(filename, error)
            raise AllInOneError(error)

        return result

    @cache
    def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
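        """Return the list of file names of the dataset (or of its parent,
        with parent=True).  If firstRun and/or lastRun are given, files whose
        run numbers can be deduced from their names are filtered accordingly;
        files with undecipherable names are kept, unless forcerunselection is
        True, in which case they raise an error."""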
        fileList = [ self.__findInJson(fileInfo,"name")
                     for fileInfo in self.fileInfoList(parent) ]

        if firstRun or lastRun:
            if not firstRun: firstRun = -1
            if not lastRun: lastRun = float('infinity')
            unknownfilenames, reasons = [], set()
            for filename in fileList[:]:
                try:
                    if not firstRun <= self.getrunnumberfromfilename(filename) <= lastRun:
                        fileList.remove(filename)
                except AllInOneError as e:
                    if forcerunselection: raise
                    unknownfilenames.append(str(e).split("\n")[1])
                    reasons.add(str(e).split("\n")[2])
            if reasons:
                if len(unknownfilenames) == len(fileList):
                    print("Could not figure out the run numbers of any of the filenames for the following reason(s):")
                else:
                    print("Could not figure out the run numbers of the following filenames:")
                    for filename in unknownfilenames:
                        print("    "+filename)
                    print("for the following reason(s):")
                for reason in reasons:
                    print("    "+reason)
                print("Using the files anyway.  The runs will be filtered at the CMSSW level.")
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    @cache
    def runList( self ):
        return self.__getRunList()


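# Simple smoke test; running it requires a CMSSW environment (CMSSW_BASE and
# CMSSW_RELEASE_BASE set) and working DAS access.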
if __name__ == '__main__':
    print("Start testing...")
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print(dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128"))
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )