# Classification of DQM ROOT files based on their file name.
import re

# Regular expressions used to check file and dataset name validity.
#
# Every pattern below is written as adjacent raw-string pieces; Python joins
# adjacent literals into one string, so each piece can carry its own comment.

# Valid dataset names.
RXDATASET = re.compile(
  r"^"
  r"(/[-A-Za-z0-9_]+){3}"   # exactly three "/component" parts
  r"$"
)

# Valid RelVal dataset names.  Group 1 of each RelVal pattern captures the
# release string ("CMSSW_<n>_<n>..." with an optional "_pre<n>" suffix).
RXRELVALMC = re.compile(
  r"^/RelVal[^/]+/"                          # first component starts with "RelVal"
  r"(CMSSW(?:_[0-9]+)+(?:_pre[0-9]+)?)"      # group 1: release
  r"[-_].*$"                                 # "-" or "_" followed by anything
)
RXRELVALRUNDEPMC = re.compile(
  r"^/RelVal[^/]+/"                          # first component starts with "RelVal"
  r"(CMSSW(?:_[0-9]+)+(?:_pre[0-9]+)?)"      # group 1: release
  r"[-_].*rundepMC.*$"                       # remainder must contain "rundepMC"
)
RXRELVALDATA = re.compile(
  r"^/[^/]+/"                                # any first component
  r"(CMSSW(?:_[0-9]+)+(?:_pre[0-9]+)?)"      # group 1: release
  r"[-_].*$"                                 # "-" or "_" followed by anything
)
RXRUNDEPMC = re.compile(
  r"^/(?!RelVal)[^/]+/"                      # first component must NOT start with "RelVal"
  r".*rundepMC.*$"                           # remainder must contain "rundepMC"
)

# Online DQM files.
RXONLINE = re.compile(
  r"^(?:.*/)?"              # optional leading directory part
  r"DQM_V(\d+)"             # group 1: version digits
  r"(_[A-Za-z0-9]+)?"       # group 2: optional "_<alphanumeric tag>"
  r"_R(\d+)"                # group 3: run number digits
  r"\.root$"
)

# Offline DQM files.
RXOFFLINE = re.compile(
  r"^(?:.*/)?"                      # optional leading directory part
  r"DQM_V(\d+)"                     # group 1: version digits
  r"_R(\d+)"                        # group 2: run number digits
  r"((?:__[-A-Za-z0-9_]+){3})"      # group 3: three "__component" parts
  r"\.root$"
)

# --------------------------------------------------------------------
def _classify_online(match):
  """Build the classification result for a path that matched RXONLINE."""
  version = int(match.group(1))
  runnr = int(match.group(3))
  # Group 2 is the optional "_<tag>" part; drop the leading underscore, or
  # keep None when the file name carries no such part.
  tag = match.group(2)
  subsys = tag[1:] if tag else None

  if version != 1:
    return False, "file version is not 1"
  if runnr <= 10000:
    return False, "online file has run number <= 10000"
  return True, { 'class': 'online_data', 'version': version,
                 'subsystem': subsys, 'runnr': runnr,
                 'dataset': "/Global/Online/ALL" }


def _classify_offline(match):
  """Build the classification result for a path that matched RXOFFLINE."""
  version = int(match.group(1))
  # The file name encodes the dataset with "__" in place of "/".
  dataset = match.group(3).replace("__", "/")
  if not RXDATASET.match(dataset):
    return False, "Invalid dataset name"

  runnr = int(match.group(2))
  if version != 1:
    return False, "file version is not 1"
  if runnr < 1:
    return False, "file matches offline naming, but run number is < 1"

  def accepted(category, release=None):
    # Successful result; 'release' is only present for the relval classes.
    info = { 'class': category, 'version': version,
             'runnr': runnr, 'dataset': dataset }
    if release is not None:
      info['release'] = release
    return True, info

  # The order of the checks below matters: RXRELVALDATA also matches RelVal MC
  # dataset names, and the run dependent patterns are specialisations of the
  # more general ones, so the most specific pattern must be tried first.
  if RXRUNDEPMC.match(dataset):
    if runnr == 1:
      return False, "file matches Run Dependent MonteCarlo naming, but run number is 1"
    return accepted('simulated_rundep')

  relval = RXRELVALRUNDEPMC.match(dataset)
  if relval:
    if runnr == 1:
      return False, "file matches Run Dependent MonteCarlo naming, but run number is 1"
    return accepted('relval_rundepmc', relval.group(1))

  relval = RXRELVALMC.match(dataset)
  if relval:
    if runnr != 1:
      return False, "file matches relval mc naming, but run number != 1"
    return accepted('relval_mc', relval.group(1))

  relval = RXRELVALDATA.match(dataset)
  if relval:
    if runnr == 1:
      return False, "file matches relval data naming, but run number = 1"
    return accepted('relval_data', relval.group(1))

  if "CMSSW" in dataset:
    return False, "non-relval dataset name contains 'CMSSW'"

  # Plain offline naming: run number 1 is treated as simulation, anything
  # above as real data (run numbers < 1 were rejected above).
  return accepted('offline_data' if runnr > 1 else 'simulated')


def classifyDQMFile(path):
  """Pre-classify a file into a main category based on file name structure.

  path: path of the root file to classify (relative to the uploads dir,
        coming from the walk).

  Returns a tuple of:
    a boolean: True or False depending on whether the classification went OK
    a string or dictionary:
      - In case the classification went wrong: a string with the reason.
      - In case the classification was OK: a dictionary with classification
        information. It always holds 'class', 'version', 'runnr' and
        'dataset'; 'online_data' adds 'subsystem' and the relval classes
        add 'release'.

  Possible values of 'class' are: 'online_data', 'simulated_rundep',
  'relval_rundepmc', 'relval_mc', 'relval_data', 'offline_data' and
  'simulated'.

  This function does not raise for bad input: any error while classifying
  (e.g. a path that is not a string) is reported as a failed classification.
  """
  try:
    m = RXONLINE.match(path)
    if m:
      return _classify_online(m)

    m = RXOFFLINE.match(path)
    if m:
      return _classify_offline(m)

    return False, "file matches no known naming convention"
  except Exception:
    # Deliberately broad so that one bad file name cannot abort the caller,
    # but not a bare "except:", which would also swallow KeyboardInterrupt
    # and SystemExit.
    return False, "error while classifying file name"