dataset.py

CMSSW/Alignment/CommonAlignment/python/tools/dataset.py

Line Code

Line	Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278	`import abc` `import csv` `import os` `import re` `import Utilities.General.cmssw_das_client as das_client` `from .utilities import cache` `class DatasetError(Exception): pass` `defaultdasinstance = "prod/global"` `class RunRange(object):` `def __init__(self, firstrun, lastrun, runs):` `self.firstrun = firstrun` `self.lastrun = lastrun` `self.runs = runs` `def __contains__(self, run):` `if self.runs and run not in self.runs: return False` `return self.firstrun <= run <= self.lastrun` `def dasquery(dasQuery, dasLimit=0):` `dasData = das_client.get_data(dasQuery, dasLimit)` `if isinstance(dasData, str):` `jsondict = json.loads( dasData )` `else:` `jsondict = dasData` `# Check, if the DAS query fails` `try:` `error = findinjson(jsondict, "data","error")` `except KeyError:` `error = None` `if error or findinjson(jsondict, "status") != 'ok' or "data" not in jsondict:` `try:` `jsonstr = findinjson(jsondict, "reason")` `except KeyError:` `jsonstr = str(jsondict)` `if len(jsonstr) > 
10000:` `jsonfile = "das_query_output_%i.txt"` `i = 0` `while os.path.lexists(jsonfile % i):` `i += 1` `jsonfile = jsonfile % i` `theFile = open( jsonfile, "w" )` `theFile.write( jsonstr )` `theFile.close()` `msg = "The DAS query returned an error. The output is very long, and has been stored in:\n" + jsonfile` `else:` `msg = "The DAS query returned a error. Here is the output\n" + jsonstr` `msg += "\nIt's possible that this was a server error. If so, it may work if you try again later"` `raise DatasetError(msg)` `return findinjson(jsondict, "data")` `def getrunnumbersfromfile(filename, trydas=True, allowunknown=False, dasinstance=defaultdasinstance):` `parts = filename.split("/")` `error = None` `if parts[0] != "" or parts[1] != "store":` `error = "does not start with /store"` `elif parts[2] in ["mc", "relval"]:` `return [1]` `elif not parts[-1].endswith(".root"):` `error = "does not end with something.root"` `elif len(parts) != 12:` `error = "should be exactly 11 slashes counting the first one"` `else:` `runnumberparts = parts[-5:-2]` `if not all(len(part)==3 for part in runnumberparts):` `error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))` `try:` `return [int("".join(runnumberparts))]` `except ValueError:` `error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))` `if error and trydas:` `try:` `query = "run file={} instance={}".format(filename, dasinstance)` `dasoutput = dasquery(query)` `result = findinjson(dasoutput, "run")` `return sum((findinjson(run, "run_number") for run in result), [])` `except Exception as e:` `error = str(e)` `if error and allowunknown:` `return [-1]` `if error:` `error = "could not figure out which run number this file is from.\nMaybe try with allowunknown=True?\n {}\n{}".format(filename, error)` `raise DatasetError(error)` `def findinjson(jsondict, strings):` `if len(strings) == 0:` `return jsondict` `if isinstance(jsondict,dict):` `if strings[0] in jsondict:` `try:` 
`return findinjson(jsondict[strings[0]], strings[1:])` `except KeyError:` `pass` `else:` `for a in jsondict:` `if strings[0] in a:` `try:` `return findinjson(a[strings[0]], strings[1:])` `except (TypeError, KeyError): #TypeError because a could be a string and contain strings[0]` `pass` `#if it's not found` `raise KeyError("Can't find " + strings[0])` `class DataFile(object):` `def __init__(self, filename, nevents, runs=None, trydas=True, allowunknown=False, dasinstance=defaultdasinstance):` `self.filename = filename` `self.nevents = int(nevents)` `if runs is None:` `runs = getrunnumbersfromfile(filename, trydas=trydas, allowunknown=allowunknown, dasinstance=dasinstance)` `if isinstance(runs, str):` `runs = runs.split()` `self.runs = [int(_) for _ in runs]` `def getdict(self):` `return {"filename": self.filename, "nevents": str(self.nevents), "runs": " ".join(str(_) for _ in self.runs)}` `class DatasetBase(object):` `__metaclass__ = abc.ABCMeta` `@abc.abstractmethod` `def getfiles(self, usecache):` `pass` `@abc.abstractproperty` `def headercomment(self):` `pass` `def writefilelist_validation(self, firstrun, lastrun, runs, maxevents, outputfile=None, usecache=True):` `runrange = RunRange(firstrun=firstrun, lastrun=lastrun, runs=runs)` `if outputfile is None:` `outputfile = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "OfflineValidation", "python", self.filenamebase+"_cff.py")` `if maxevents < 0: maxevents = float("inf")` `totalevents = sum(datafile.nevents for datafile in self.getfiles(usecache) if all(run in runrange for run in datafile.runs))` `if totalevents == 0:` `raise ValueError("No events within the run range!")` `accepted = rejected = 0. 
#float so fractions are easier` `fractiontoaccept = 1.maxevents / totalevents` `with open(outputfile, "w") as f:` `f.write("#"+self.headercomment+"\n")` `f.write(validationheader)` `for datafile in self.getfiles(usecache):` `if all(run in runrange for run in datafile.runs):` `if accepted == 0 or accepted / (accepted+rejected) <= fractiontoaccept:` `f.write('"' + datafile.filename + '",\n')` `accepted += datafile.nevents` `else:` `rejected += datafile.nevents` `elif any(run in runrange for run in datafile.runs):` `raise DatasetError("file {} has multiple runs {}, which straddle firstrun or lastrun".format(datafile.filename, datafile.runs))` `f.write("#total events in these files: {}".format(accepted))` `f.write(validationfooter)` `def writefilelist_hippy(self, firstrun, lastrun, runs, eventsperjob, maxevents, outputfile, usecache=True):` `runrange = RunRange(firstrun=firstrun, lastrun=lastrun, runs=runs)` `if maxevents < 0: maxevents = float("inf")` `totalevents = sum(datafile.nevents for datafile in self.getfiles(usecache) if all(run in runrange for run in datafile.runs))` `if totalevents == 0:` `raise ValueError("No events within the run range!")` `accepted = rejected = inthisjob = 0. 
#float so fractions are easier` `fractiontoaccept = 1.maxevents / totalevents` `writecomma = False` `with open(outputfile, "w") as f:` `for datafile in self.getfiles(usecache):` `if all(run in runrange for run in datafile.runs):` `if accepted == 0 or accepted / (accepted+rejected) <= fractiontoaccept:` `if writecomma: f.write(",")` `f.write("'" + datafile.filename + "'")` `accepted += datafile.nevents` `inthisjob += datafile.nevents` `if inthisjob >= eventsperjob:` `f.write("\n")` `inthisjob = 0` `writecomma = False` `else:` `writecomma = True` `else:` `rejected += datafile.nevents` `elif any(run in runrange for run in datafile.runs):` `raise DatasetError("file {} has multiple runs {}, which straddle firstrun or lastrun".format(datafile.filename, datafile.runs))` `f.write("\n")` `class Dataset(DatasetBase):` `def __init__(self, datasetname, dasinstance=defaultdasinstance):` `self.datasetname = datasetname` `if re.match(r'/.+/.+/.+', datasetname):` `self.official = True` `self.filenamebase = "Dataset" + self.datasetname.replace("/","_")` `else:` `self.official = False` `self.filenamebase = datasetname` `self.dasinstance = dasinstance` `@cache` `def getfiles(self, usecache):` `filename = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "CommonAlignment", "data", self.filenamebase+".csv")` `if not usecache:` `try:` `os.remove(filename)` `except OSError as e:` `if os.path.exists(filename):` `raise` `result = []` `try:` `with open(filename) as f:` `for row in csv.DictReader(f):` `result.append(DataFile(row))` `return result` `except IOError:` `pass` `query = "file dataset={} instance={} detail=true \| grep file.name, file.nevents".format(self.datasetname, self.dasinstance)` `dasoutput = dasquery(query)` `if not dasoutput:` `raise DatasetError("No files are available for the dataset '{}'. This can be "` `"due to a typo or due to a DAS problem. 
Please check the "` `"spelling of the dataset and/or try again.".format(datasetname))` `result = [DataFile(findinjson(_, "file", "name"), findinjson(_, "file", "nevents")) for _ in dasoutput if int(findinjson(_, "file", "nevents"))]` `try:` `with open(filename, "w") as f:` `writer = csv.DictWriter(f, ("filename", "nevents", "runs"))` `writer.writeheader()` `for datafile in result:` `writer.writerow(datafile.getdict())` `except Exception as e:` `print("Couldn't write the dataset csv file:\n\n{}".format(e))` `return result` `@property` `def headercomment(self):` `return self.datasetname` `class MultipleDatasets(DatasetBase):` `def __init__(self, datasets, **kwargs):` `dasinstance = defaultdasinstance` `for kw, kwarg in kwargs.iteritems():` `if kw == "dasinstance":` `dasinstance = kwarg` `else:` `raise TypeError("Unknown kwarg {}={}".format(kw, kwarg))` `self.datasets = [Dataset(dataset, dasinstance=dasinstance) for dataset in datasets]` `@cache` `def getfiles(self, usecache):` `return sum([d.getfiles(usecache=usecache) for d in self.datasets], [])` `@property` `def headercomment(self):` `return ", ".join(d.headercomment for d in self.datasets)` `validationheader = """` `import FWCore.ParameterSet.Config as cms` `maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) )` `readFiles = cms.untracked.vstring()` `secFiles = cms.untracked.vstring()` `source = cms.Source ("PoolSource",fileNames = readFiles, secondaryFileNames = secFiles)` `readFiles.extend( [` `"""` `validationfooter = """` `] )` `"""`

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278


import abc
import csv
import json
import os
import re

import Utilities.General.cmssw_das_client as das_client

from .utilities import cache

class DatasetError(Exception):
  """Error raised by the dataset tools in this module.

  Used for failed DAS queries, for files whose run number cannot be
  determined and for files whose runs straddle the requested run range.
  """

# DAS instance used by the functions and classes below whenever the caller
# does not pass an explicit dasinstance argument.
defaultdasinstance = "prod/global"

class RunRange(object):
  """Selection of runs that supports the "run in runrange" test.

  firstrun, lastrun: inclusive boundaries of the accepted run numbers
  runs: optional collection of run numbers; when it is non-empty, a run
        additionally has to be part of it to be accepted
  """
  def __init__(self, firstrun, lastrun, runs):
    self.firstrun, self.lastrun = firstrun, lastrun
    self.runs = runs

  def __contains__(self, run):
    """True if run passes the explicit run list (if any) and lies within
    [firstrun, lastrun]."""
    passeslist = not self.runs or run in self.runs
    if not passeslist:
      return False
    withinbounds = self.firstrun <= run <= self.lastrun
    return withinbounds

def dasquery(dasQuery, dasLimit=0):
  """Run a DAS query and return the "data" part of the response.

  dasQuery: query string handed to das_client.get_data
  dasLimit: limit handed to das_client.get_data

  A string response is decoded with json.loads, anything else is used as is.

  Raises DatasetError if the response carries a "data"/"error" entry, if its
  "status" is missing or not 'ok', or if it has no "data" key.  When the
  error text is longer than 10000 characters it is written to the first
  unused das_query_output_<i>.txt in the current directory and the exception
  message only names that file.
  """
  dasData = das_client.get_data(dasQuery, dasLimit)
  if isinstance(dasData, str):
    jsondict = json.loads( dasData )
  else:
    jsondict = dasData
  # Check, if the DAS query fails
  try:
    error = findinjson(jsondict, "data","error")
  except KeyError:
    error = None
  failed = bool(error)
  if not failed:
    try:
      failed = findinjson(jsondict, "status") != 'ok' or "data" not in jsondict
    except KeyError:
      # a response without any "status" is reported like any other failed
      # query instead of leaking a bare KeyError to the caller
      failed = True
  if failed:
    try:
      jsonstr = findinjson(jsondict, "reason")
    except KeyError:
      jsonstr = str(jsondict)
    if len(jsonstr) > 10000:
      jsonfile = "das_query_output_%i.txt"
      i = 0
      # pick the first index that does not overwrite an earlier dump
      while os.path.lexists(jsonfile % i):
        i += 1
      jsonfile = jsonfile % i
      with open( jsonfile, "w" ) as theFile:
        theFile.write( jsonstr )
      msg = "The DAS query returned an error.  The output is very long, and has been stored in:\n" + jsonfile
    else:
      msg = "The DAS query returned a error.  Here is the output\n" + jsonstr
    msg += "\nIt's possible that this was a server error.  If so, it may work if you try again later"
    raise DatasetError(msg)
  return findinjson(jsondict, "data")

def getrunnumbersfromfile(filename, trydas=True, allowunknown=False, dasinstance=defaultdasinstance):
  """Return the list of run numbers of the file called filename.

  The run number is first looked for in the file name itself:
    - names starting with /store/mc or /store/relval give [1]
    - otherwise the name has to consist of 12 "/"-separated parts (the first
      one empty) and end in .root; the 3 directories parts[-5:-2] have to be
      3 characters long each and are joined into the run number

  filename: logical file name, expected to start with /store
  trydas: if the name cannot be parsed, ask DAS with the query
          "run file=<filename> instance=<dasinstance>"
  allowunknown: if the run number still cannot be found, return [-1]
                instead of raising
  dasinstance: DAS instance used for the DAS query

  Raises DatasetError if the run number cannot be determined and
  allowunknown is false.
  """
  parts = filename.split("/")
  error = None
  # slices and the length guard keep short names such as "" or "/store" from
  # raising IndexError; they end up in one of the error branches instead
  if parts[:2] != ["", "store"]:
    error = "does not start with /store"
  elif len(parts) > 2 and parts[2] in ["mc", "relval"]:
    return [1]
  elif not parts[-1].endswith(".root"):
    error = "does not end with something.root"
  elif len(parts) != 12:
    error = "should be exactly 11 slashes counting the first one"
  else:
    runnumberparts = parts[-5:-2]
    if not all(len(part)==3 for part in runnumberparts):
      error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))
    else:
      # only trust the directories as a run number once the length check
      # has passed; otherwise fall through to DAS / the error handling
      try:
        return [int("".join(runnumberparts))]
      except ValueError:
        error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))

  if error and trydas:
    try:
      query = "run file={} instance={}".format(filename, dasinstance)
      dasoutput = dasquery(query)
      result = findinjson(dasoutput, "run")
      return sum((findinjson(run, "run_number") for run in result), [])
    except Exception as e:
      # deliberately broad: any DAS failure just becomes the reported error
      error = str(e)

  if error and allowunknown:
    return [-1]

  if error:
    error = "could not figure out which run number this file is from.\nMaybe try with allowunknown=True?\n  {}\n{}".format(filename, error)
    raise DatasetError(error)

def findinjson(jsondict, *strings):
  """Follow the keys in strings through nested dicts/lists and return the
  value found at the end.

  jsondict: decoded JSON; a dict, or an iterable whose elements are searched
            in order
  strings: the keys to follow, outermost first; with no keys jsondict itself
           is returned

  For a dict the first key is looked up directly.  For anything else every
  element is tried in turn and the first one in which the remaining keys can
  be followed wins.

  Raises KeyError if the keys cannot be followed.  Elements or values that
  cannot contain keys at all (numbers, None, strings) count as "not found"
  rather than raising TypeError, so callers only have to catch KeyError.
  """
  if len(strings) == 0:
    return jsondict
  key = strings[0]
  if isinstance(jsondict,dict):
    if key in jsondict:
      try:
        return findinjson(jsondict[key], *strings[1:])
      except KeyError:
        pass
  else:
    try:
      candidates = iter(jsondict)
    except TypeError:  #a scalar or None has nothing to search in
      candidates = ()
    for a in candidates:
      try:
        if key in a:
          return findinjson(a[key], *strings[1:])
      except (TypeError, KeyError):  #TypeError because a could be a number, or a string that contains key
        pass
  #if it's not found
  raise KeyError("Can't find " + str(key))

class DataFile(object):
  """One file of a dataset: its name, number of events and run numbers."""
  def __init__(self, filename, nevents, runs=None, trydas=True, allowunknown=False, dasinstance=defaultdasinstance):
    """
    filename: name of the file
    nevents: number of events, anything int() accepts
    runs: run numbers, either an iterable or one whitespace-separated
          string; if None they are obtained from getrunnumbersfromfile,
          which receives trydas, allowunknown and dasinstance
    """
    self.filename = filename
    self.nevents = int(nevents)

    runlist = runs
    if runlist is None:
      runlist = getrunnumbersfromfile(filename, trydas=trydas, allowunknown=allowunknown, dasinstance=dasinstance)
    if isinstance(runlist, str):
      runlist = runlist.split()
    self.runs = list(map(int, runlist))

  def getdict(self):
    """Return the file as a dict of strings with the keys filename, nevents
    and runs (run numbers joined by spaces)."""
    return {
      "filename": self.filename,
      "nevents": str(self.nevents),
      "runs": " ".join(map(str, self.runs)),
    }

class DatasetBase(object):
  """Base class of the datasets; provides the writers of file lists.

  Subclasses have to supply getfiles() and headercomment.
  writefilelist_validation also reads self.filenamebase when it is called
  without an outputfile; that attribute is not set by this class.
  """
  __metaclass__ = abc.ABCMeta  #Python 2 way of declaring the metaclass

  @abc.abstractmethod
  def getfiles(self, usecache):
    """Return the files of the dataset; the writers below use the
    filename, nevents and runs attributes of each of them."""

  @abc.abstractproperty
  def headercomment(self):
    """Text written after the "#" on the first line of the validation
    file list."""

  def writefilelist_validation(self, firstrun, lastrun, runs, maxevents, outputfile=None, usecache=True):
    """Write a python file list with the files inside the run range.

    firstrun, lastrun, runs: define the accepted runs, see RunRange
    maxevents: approximate number of events wanted, negative for no limit
    outputfile: file to write; by default <filenamebase>_cff.py in
                $CMSSW_BASE/src/Alignment/OfflineValidation/python
    usecache: passed on to getfiles

    A file is written out if nothing has been accepted yet or if the
    fraction of events accepted so far is at most maxevents divided by the
    number of events in the run range.

    Raises ValueError if there are no events within the run range and
    DatasetError if only a part of the runs of a file is in the run range.
    """
    selection = RunRange(firstrun=firstrun, lastrun=lastrun, runs=runs)

    if outputfile is None:
      cmsswbase = os.environ["CMSSW_BASE"]
      outputfile = os.path.join(cmsswbase, "src", "Alignment", "OfflineValidation", "python", self.filenamebase+"_cff.py")

    if maxevents < 0:
      maxevents = float("inf")

    totalevents = 0
    for candidate in self.getfiles(usecache):
      if all(run in selection for run in candidate.runs):
        totalevents += candidate.nevents
    if totalevents == 0:
      raise ValueError("No events within the run range!")

    fractiontoaccept = 1.*maxevents / totalevents
    #floats so fractions are easier
    accepted = 0.
    rejected = 0.

    with open(outputfile, "w") as out:
      out.write("#"+self.headercomment+"\n")
      out.write(validationheader)
      for candidate in self.getfiles(usecache):
        if not all(run in selection for run in candidate.runs):
          #fine if completely outside of the range, an error if partly inside
          if any(run in selection for run in candidate.runs):
            raise DatasetError("file {} has multiple runs {}, which straddle firstrun or lastrun".format(candidate.filename, candidate.runs))
          continue
        if accepted == 0 or accepted / (accepted+rejected) <= fractiontoaccept:
          out.write('"' + candidate.filename + '",\n')
          accepted += candidate.nevents
        else:
          rejected += candidate.nevents
      out.write("#total events in these files: {}".format(accepted))
      out.write(validationfooter)

  def writefilelist_hippy(self, firstrun, lastrun, runs, eventsperjob, maxevents, outputfile, usecache=True):
    """Write the files inside the run range to outputfile, one line per job.

    Each line holds comma-separated file names in single quotes; a line is
    ended as soon as the files on it hold at least eventsperjob events.

    firstrun, lastrun, runs: define the accepted runs, see RunRange
    maxevents: approximate number of events wanted, negative for no limit
    usecache: passed on to getfiles

    Files are accepted by the same rule as in writefilelist_validation.

    Raises ValueError if there are no events within the run range and
    DatasetError if only a part of the runs of a file is in the run range.
    """
    selection = RunRange(firstrun=firstrun, lastrun=lastrun, runs=runs)
    if maxevents < 0:
      maxevents = float("inf")

    totalevents = 0
    for candidate in self.getfiles(usecache):
      if all(run in selection for run in candidate.runs):
        totalevents += candidate.nevents
    if totalevents == 0:
      raise ValueError("No events within the run range!")

    fractiontoaccept = 1.*maxevents / totalevents
    #floats so fractions are easier
    accepted = 0.
    rejected = 0.
    inthisjob = 0.
    writecomma = False

    with open(outputfile, "w") as out:
      for candidate in self.getfiles(usecache):
        if not all(run in selection for run in candidate.runs):
          #fine if completely outside of the range, an error if partly inside
          if any(run in selection for run in candidate.runs):
            raise DatasetError("file {} has multiple runs {}, which straddle firstrun or lastrun".format(candidate.filename, candidate.runs))
          continue
        if not (accepted == 0 or accepted / (accepted+rejected) <= fractiontoaccept):
          rejected += candidate.nevents
          continue

        if writecomma:
          out.write(",")
        out.write("'" + candidate.filename + "'")
        accepted += candidate.nevents
        inthisjob += candidate.nevents

        jobisfull = inthisjob >= eventsperjob
        if jobisfull:
          #end the line of this job and start counting for the next one
          out.write("\n")
          inthisjob = 0
        writecomma = not jobisfull
      out.write("\n")

class Dataset(DatasetBase):
  """A single dataset; its list of files comes from a csv cache or from DAS."""
  def __init__(self, datasetname, dasinstance=defaultdasinstance):
    """
    datasetname: name of the dataset; names of the form /a/b/c are flagged
                 as official and get "Dataset" plus the name with "/"
                 replaced by "_" as filenamebase, any other name is used as
                 filenamebase unchanged
    dasinstance: DAS instance used for the queries
    """
    self.datasetname = datasetname
    if re.match(r'/.+/.+/.+', datasetname):
      self.official = True
      self.filenamebase = "Dataset" + self.datasetname.replace("/","_")
    else:
      self.official = False
      self.filenamebase = datasetname

    self.dasinstance = dasinstance

  @cache
  def getfiles(self, usecache):
    """Return the list of DataFile objects of this dataset.

    The list is read from
    $CMSSW_BASE/src/Alignment/CommonAlignment/data/<filenamebase>.csv if
    that file can be opened.  Otherwise DAS is queried, files without events
    are dropped, and the result is written to that csv file (a failure to
    write is only printed).

    usecache: if false, the csv file is removed first, so DAS is queried

    Raises DatasetError if DAS returns no files.
    """
    filename = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "CommonAlignment", "data", self.filenamebase+".csv")
    if not usecache:
      try:
        os.remove(filename)
      except OSError:
        #only a problem if the file is still there afterwards
        if os.path.exists(filename):
          raise

    result = []
    try:
      with open(filename) as f:
        for row in csv.DictReader(f):
          result.append(DataFile(**row))
        return result
    except IOError:
      pass

    query = "file dataset={} instance={} detail=true | grep file.name, file.nevents".format(self.datasetname, self.dasinstance)
    dasoutput = dasquery(query)
    if not dasoutput:
      raise DatasetError("No files are available for the dataset '{}'. This can be "
                         "due to a typo or due to a DAS problem. Please check the "
                         "spelling of the dataset and/or try again.".format(self.datasetname))

    result = []
    for entry in dasoutput:
      nevents = findinjson(entry, "file", "nevents")
      if int(nevents):
        #look up run numbers in the same DAS instance the file came from
        result.append(DataFile(findinjson(entry, "file", "name"), nevents, dasinstance=self.dasinstance))

    try:
      with open(filename, "w") as f:
        writer = csv.DictWriter(f, ("filename", "nevents", "runs"))
        writer.writeheader()
        for datafile in result:
          writer.writerow(datafile.getdict())
    except Exception as e:
      #the cache is only a convenience, so carry on without it
      print("Couldn't write the dataset csv file:\n\n{}".format(e))
    return result

  @property
  def headercomment(self):
    """The dataset name."""
    return self.datasetname

class MultipleDatasets(DatasetBase):
  """Several datasets treated as one; their file lists are concatenated."""
  def __init__(self, *datasets, **kwargs):
    """
    datasets: names of the datasets, each one is handed to Dataset
    kwargs: only dasinstance is accepted; it is passed on to every Dataset
            and defaults to defaultdasinstance

    Raises TypeError for any other keyword argument.
    """
    dasinstance = defaultdasinstance
    #items() instead of iteritems(), which only exists in Python 2
    for kw, kwarg in kwargs.items():
      if kw == "dasinstance":
        dasinstance = kwarg
      else:
        raise TypeError("Unknown kwarg {}={}".format(kw, kwarg))
    self.datasets = [Dataset(dataset, dasinstance=dasinstance) for dataset in datasets]

  @cache
  def getfiles(self, usecache):
    """Return the files of all the datasets, in the order of the datasets."""
    return sum([d.getfiles(usecache=usecache) for d in self.datasets], [])

  @property
  def headercomment(self):
    """The header comments of the datasets, separated by ", "."""
    return ", ".join(d.headercomment for d in self.datasets)

# Text that DatasetBase.writefilelist_validation writes before the file
# names; it ends by opening the list given to readFiles.extend.
validationheader = """
import FWCore.ParameterSet.Config as cms

maxEvents = cms.untracked.PSet( input = cms.untracked.int32(-1) )
readFiles = cms.untracked.vstring()
secFiles = cms.untracked.vstring()
source = cms.Source ("PoolSource",fileNames = readFiles, secondaryFileNames = secFiles)
readFiles.extend( [
"""

# Text that DatasetBase.writefilelist_validation writes after the file
# names; it closes the list opened at the end of validationheader.
validationfooter = """
] )
"""