Back to home page

Project CMSSW displayed by LXR

 
 

    


Warning: /DQMServices/Components/scripts/dqm-grep is written in an unsupported language. The file is not indexed.

0001 #!/usr/bin/env python3
0002 
0003 """Usage: dqm-grep -f PATTERN -e EXPRESSION [OPTIONS]
0004 
0005 Grep contents of DQM GUI index for samples matching an expression
0006 and contents whose name matches a wild card pattern.
0007 
0008 The sample expression (-e option) is a boolean expression which should
0009 yield True if the sample should be processed and False otherwise. The
0010 expression may use the terms `run`, `dataset`, `version` and `type`,
0011 and the function `match()` for matching regular expressions. See the
0012 examples below for typical sample expressions.
0013 
0014 The filter pattern (-f option) is a wild card pattern which defines
0015 what to search. It may use wild card '*' to match any string within
0016 one directory level, '**' to match any number of subdirectories, and
0017 '***' to match any subdirectory or monitor element object. Names with
0018 a trailing slash match only directories; names without trailing slash
0019 match only plain objects. The pattern must always start with a slash.
0020 
0021 For example the pattern '/*/EventInfo/' matches all `EventInfo` sub-
0022 directories one level down from the root. The pattern '/*/Ev*/*Summary*'
0023 matches all plain objects whose name contain 'Summary' in subdirectories
0024 starting with 'Ev' one level down from the per-sample root directory.
0025 The pattern '/CSC/***' matches all directories and objects inside the
0026 top-level 'CSC' directory.
0027 
0028 It is important to use a sufficiently strict pattern to avoid unnecessary
0029 - and very expensive - traversal of the full index. There can easily be
0030 tens of thousands of directories to traverse per sample, and retrieving
0031 them all can get very expensive.
0032 
0033 Examples of use:
0034 
0035 dqm-grep -f '/CSC/Event*/*Summary*' -e 'match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
0036 dqm-grep -f '/*/Event*/*Summary*' -e 'run == 168330 and match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
0037 dqm-grep -f '/*/Event*/***' -e 'run == 168330 and match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
0038 dqm-grep -f '/C*/***' -e 'run == 168330 and match("/.*/Run2011A-Express-v4/DQM", dataset)'
0039 dqm-grep -f '/*/EventInfo/CertificationSummary' -e 'match("/StreamExpress.*/Run2011A-Express-v4/DQM", dataset)'
0040 
0041 In order to authenticate to the target server, standard grid certificate
0042 environment must be available. Typically this would be X509_CERT_DIR and
0043 either X509_USER_PROXY or X509_USER_CERT and X509_USER_KEY environment
0044 variables. If these variables are not set, the following defaults are
0045 checked for existence. Note that if the script falls back on using a
0046 key rather than a proxy, it will prompt for the key password.
0047 - $X509_CERT_DIR: /etc/grid-security/certificates
0048 - $X509_USER_KEY: $HOME/.globus/userkey.pem
0049 - $X509_USER_CERT: $HOME/.globus/usercert.pem
0050 """
0051 
0052 from DQMServices.Components.HTTP import RequestManager
0053 from DQMServices.Components.X509 import SSLOptions
0054 import sys, re, json, pycurl
0055 from urllib.parse import quote
0056 from optparse import OptionParser
0057 from time import time
0058 
# Object types.
DIR = 0 # Directory
FILE = 1 # File / simple object.
ANY = 2 # Either; used only for filters.

# HTTP protocol `User-agent` identification string.
ident = "DQMGrep/1.0 python/%s.%s.%s" % sys.version_info[:3]

# Where to find JSON contents at a given server; interpolated with the
# sample dict (keys used: section, run, dataset, path).
url_content = "/%(section)s/%(run)d%(dataset)s%(path)s"

# SSL/X509 options (populated from the environment after option parsing).
ssl_opts = None

# HTTP request manager for content requests (created after option parsing).
reqman = None

# Number of HTTP requests made for content; reset for each sample scanned.
nreq = 0

# Found objects (per sample), as (path, item) tuples.
found = []
0081 
class filter:
  """A single stage of a compiled search filter.

- `type`: kind of object this stage may match: `FILE`, `DIR` or `ANY`.
- `recurse`: if True, apply the pattern recursively to subdirectories.
- `pattern`: the regular expression source text for this stage.
- `rx`: the compiled regular expression object for `pattern`."""
  type = FILE
  recurse = False
  pattern = ""
  rx = None

  def __repr__(self):
    fields = (self.pattern, self.type, self.recurse)
    return "(filter pattern='%s' type=%s recurse=%s)" % fields
0097 
def pattern_to_filter(pattern):
  """Compile a wild-card search `pattern` into a list of `filter` steps.

The pattern is a slash-separated path using '*' wild cards, for example
"/*/EventInfo/*Summary". A single star matches any string except slashes,
i.e. matching stays within one directory level. A double star matches
directories recursively, and a triple star matches either a directory or
a plain (non-directory) object.

A trailing slash makes the final component match directories only; without
it the final component matches non-directories. Patterns match the full
path and therefore must always start with a slash; to search the whole
tree use "/**/Name*".

For example "/*/EventInfo/" matches folders named 'EventInfo' one level
down from the top, without recursing further, while
"/*/EventInfo/**/*Summary" matches any plain object ending in "Summary"
anywhere under such an 'EventInfo' directory.

Returns a list of `filter` objects, one per pattern component, each of
which represents one or more levels of matching/recursion.

Raises ValueError if `pattern` does not start with a slash."""
  if not pattern.startswith("/"):
    raise ValueError("pattern must start with slash")

  steps = []
  # Collapse repeated slashes, then walk the path components after the
  # leading slash, turning each into one filter step.
  collapsed = re.sub("/+", "/", pattern)
  for component in collapsed.split('/')[1:]:
    # The presence of a further component means the previous step must
    # match a directory, not a plain object.
    if steps and steps[-1].type == FILE:
      steps[-1].type = DIR
    step = filter()
    steps.append(step)
    # Split the component into literal text and runs of stars; translate
    # each piece into regexp source.
    for piece in re.split("([*]+)", component):
      if piece == "***":
        step.pattern += ".*"
        step.recurse = True
        step.type = ANY
      elif piece == "**":
        step.pattern += ".*"
        step.recurse = True
        step.type = DIR
      elif piece == "*":
        step.pattern += "[^/]*"
        step.type = FILE
      elif piece:
        step.pattern += re.escape(piece)
        step.type = FILE
    # Anchor everything except the pure-recursion pattern ".*".
    if step.pattern != ".*":
      step.pattern = "^%s$" % step.pattern
    step.rx = re.compile(step.pattern)

  return steps
0155 
def should_process_sample(s, expr):
  """Evaluate sample predicate expression `expr` against sample `s`.

The expression may reference the sample's fields (`run`, `dataset`,
`version`, `type`) and a `match(rx, string)` helper for regular expression
matching. As a side effect the sample's `run` field is converted to an
integer in place; later URL formatting ("%(run)d") relies on this.

Returns the value of the expression (truthy if the sample should be
processed), or False if the expression fails to evaluate for this sample."""
  # Name the lambda parameter `text` to avoid shadowing the builtin `str`.
  s['match'] = lambda rx, text: re.match(rx, text)
  try:
    s['run'] = int(s['run'])
    # NOTE: `expr` comes from the command line of this tool, so eval is
    # intentional here; empty globals keep the evaluation scoped to the
    # sample dict.
    return eval(expr, {}, s)
  except Exception:
    # Malformed expression or sample: treat as "do not process" rather
    # than aborting the scan. (Deliberately narrower than a bare except,
    # so SystemExit/KeyboardInterrupt still propagate.)
    return False
  finally:
    # Remove the helper even when evaluation fails, so the sample dict is
    # left holding only its own fields.
    del s['match']
0167 
def find_matching_samples(options):
  """Generator yielding every sample at the target server which satisfies
the user's sample predicate expression (`options.sample_expr`)."""
  result_holder = {}

  def on_error(c, url, errmsg, errno):
    # Any failure retrieving the sample list is fatal.
    print("%s: failed to retrieve samples: %s (%d)" \
      % (options.server, errmsg, errno), file=sys.stderr)
    sys.exit(1)

  def on_done(c):
    decoder = json.decoder.JSONDecoder()
    result_holder['result'] = decoder.decode(c.buffer.getvalue().decode('utf-8'))

  # Use a private request manager just for the sample listing.
  sample_reqman = RequestManager(ssl_opts = ssl_opts,
                                 user_agent = ident,
                                 request_respond = on_done,
                                 request_error = on_error)
  sample_reqman.put((options.server + "/samples",))
  sample_reqman.process()

  if not result_holder:
    print("%s: no samples" % options.server, file=sys.stderr)
    sys.exit(1)

  for group in result_holder['result']['samples']:
    for sample in group['items']:
      if should_process_sample(sample, options.sample_expr):
        yield sample
0197 
def request_init(c, options, sample, filters, pos, path):
  """`RequestManager` callback to initialise a JSON contents request.

Builds the request URL for `path` within `sample` on the target server and
installs it on the curl handle `c`. Note this mutates `sample` by storing
the current `path`, which the `url_content` template interpolates.

The original code ended with an unreachable debug branch
(`if False and options.verbose: print(c.url)`); that dead code has been
removed."""
  sample.update(path = path)
  c.url = options.server + quote(url_content % sample)
  c.setopt(pycurl.URL, c.url)
0205 
def report_error(c, task, errmsg, errno):
  """`RequestManager` callback to report JSON contents request errors."""
  message = "FAILED to retrieve %s: %s (%d)" % (task, errmsg, errno)
  print(message, file=sys.stderr)
0209 
def match_filters(item, filters, poslist):
  """Match a filter list created by `pattern_to_filter` against an object.

Arguments:
- `item`: JSON for the object from the server.
- `filters`: the complete filter list; never modified.
- `poslist`: positions in `filters` at which to attempt matching.

Searching starts with `poslist` equal to [0] (the first filter). A fresh
position list is built for subdirectories: a non-recursive filter is
dropped and replaced by its successor (if any), while a recursive filter
stays on the list as well — so `poslist` acts as an NFA-style stack of all
active search positions.

Returns a tuple of:
- `name`: name of the matched object (subdirectory or plain), or None
- `matched`: True if the final filter in the chain matched this object
- `descend`: True if this is a subdirectory the search must descend into
  (a non-terminal match, distinct from `matched`)
- `poslist`: position list for searching subdirectories; empty once the
  filter list is exhausted, with or without a match."""
  name = None
  matched = False
  descend = False
  nextpos = []
  last = len(filters) - 1

  for idx in poslist:
    assert idx < len(filters)
    f = filters[idx]
    hit = False
    # Directory entries may satisfy DIR or ANY filters; plain objects may
    # satisfy FILE or ANY filters. Directory check takes precedence.
    if 'subdir' in item and f.type in (DIR, ANY) and f.rx.match(item['subdir']):
      name = item['subdir']
      descend = hit = True
    elif 'obj' in item and f.type in (FILE, ANY) and f.rx.match(item['obj']):
      name = item['obj']
      hit = True

    if hit:
      # A hit on the final filter means the whole chain matched.
      matched = matched or idx == last
      if f.recurse:
        nextpos.append(idx)
      if idx < last:
        nextpos.append(idx + 1)

  return name, matched, descend, nextpos
0264 
def process(c):
  """`RequestManager` callback to handle JSON content response.

This gets called once per every directory which has been successfully
retrieved from the server. It basically applies `match_filters` to all
objects found and requests subdirectories if necessary, and adds to
`found` objects which matched the entire filter expression.

If verbosity has been requested, also shows simple progress bar on the
search progress, one dot for every ten directories retrieved."""
  global found, nreq
  # The task tuple was queued by the main loop or by this callback itself.
  options, sample, filters, pos, path = c.task
  json_decoder = json.decoder.JSONDecoder()

  nreq += 1
  # Progress bar: one dot per ten directories, newline every 750 requests.
  if options.verbose and nreq % 10 == 0:
    sys.stdout.write(".")
    sys.stdout.flush()
    if nreq % 750 == 0:
      print()

  reply = c.buffer.getvalue().decode('utf-8')
  # Pre-process the reply text before JSON decoding. These substitutions
  # appear to work around server-side quirks: doubled quotes around simple
  # string values, and bare `nan` numbers which strict JSON cannot
  # represent (rewritten to the string "NaN") -- TODO confirm against the
  # server's actual output format.
  reply = re.sub(r'("value": ")"([A-Za-z0-9_]+")"', r'\1\2', reply)
  reply = re.sub(r'("(?:mean|rms|min|max)":) nan,', r'\1 "NaN",', reply)
  reply = json_decoder.decode(reply)

  seen = set()
  for item in reply['contents']:
    name, match, descend, newpos = match_filters(item, filters, pos)
    if match:
      found.append((path + name, item))
    # Queue each subdirectory for retrieval at most once per reply.
    if descend and name not in seen:
      reqman.put((options, sample, filters, newpos, path + name + "/"))
    seen.update((name,))
0299 
# Parse command line options. The module docstring doubles as the usage text.
op = OptionParser(usage = __doc__)
op.add_option("-e", "--samples", dest = "sample_expr", metavar = "EXPRESSION",
              help = "Evaluate EXPRESSION to decide which samples to scan")
op.add_option("-f", "--filter", dest = "glob",
              type = "string", action = "store", metavar = "PATTERN",
              default = "/*/EventInfo/*Summary",
              help = "Filter monitor elements matching PATTERN")
op.add_option("-s", "--server", dest = "server",
              type = "string", action = "store", metavar = "SERVER",
              default = "https://cmsweb.cern.ch/dqm/offline/data/json",
              help = "Pull content from SERVER")
op.add_option("-n", "--connections", dest = "connections",
              type = "int", action = "store", metavar = "NUM",
              default = 10, help = "Use NUM concurrent connections")
op.add_option("-v", "--verbose", dest = "verbose",
              action = "store_true", default = False,
              help = "Show verbose scan information")
options, args = op.parse_args()
# The script accepts no positional arguments.
if args:
  print("Too many arguments", file=sys.stderr)
  sys.exit(1)
# -e is mandatory: without a predicate, no samples would be selected.
if not options.sample_expr:
  print("Sample predicate expression required", file=sys.stderr)
  sys.exit(1)
# -f and -s have defaults but may be passed empty; reject that explicitly.
if not options.glob:
  print("Monitor element filter expression required", file=sys.stderr)
  sys.exit(1)
if not options.server:
  print("Server contact string required", file=sys.stderr)
  sys.exit(1)
0331 
# Get SSL X509 parameters (grid certificate / proxy settings from the
# environment; see the module docstring for the defaults probed).
ssl_opts = SSLOptions()
if options.verbose:
  print("Using SSL cert dir", ssl_opts.ca_path)
  print("Using SSL private key", ssl_opts.key_file)
  print("Using SSL public key", ssl_opts.cert_file)

# Convert glob pattern into a filter expression.
filters = pattern_to_filter(options.glob)

# Start a request manager for contents. `process` handles each directory
# reply and may enqueue further subdirectory requests on this same manager.
reqman = RequestManager(num_connections = options.connections,
                        ssl_opts = ssl_opts,
                        user_agent = ident,
                        request_init = request_init,
                        request_respond = process,
                        request_error = report_error)
0349 
# Process all samples matching the predicate.
ntotreq = 0 # Total directory requests across all samples.
nfound = 0 # Total matched objects across all samples.
start = time()
for sample in find_matching_samples(options):
  # `nreq` and `found` are module globals updated by `process`;
  # reset them for each sample.
  nreq = 0
  found = []
  sample['section'] = 'archive'
  if options.verbose:
    print("Scanning %s" % sample)
  # Seed the scan at the sample root with the full filter chain.
  reqman.put((options, sample, filters, [0], "/"))
  reqman.process()
  if options.verbose:
    print()
  if found:
    print("%(section)s/%(run)d%(dataset)s:" % sample)
    found.sort()
    for path, item in found:
      if 'subdir' in item:
        # Matched a directory: print with trailing slash.
        print(" %s/" % path)
      elif 'value' in item:
        # Matched a scalar monitor element with a plain value.
        print(" %s = %s" % (path, item['value']))
      else:
        # Matched a histogram-like object: show its type and entry count.
        print(" %s = [%s # %d]" % (path, item['properties']['type'], item['nentries']))
  nfound += len(found)
  ntotreq += nreq
end = time()

# Provide final summary.
if options.verbose:
  print("\nFound %d objects in %d directories in %.3f seconds" % (nfound, ntotreq, end - start))
0381