#!/usr/bin/env python3

"""Usage: dqm-grep -f PATTERN -e EXPRESSION [OPTIONS]

Grep the contents of a DQM GUI index for samples matching an expression
and contents whose name matches a wild card pattern.

The sample expression (-e option) is a boolean expression which should
yield True if the sample should be processed and False otherwise. The
expression may use the terms `run`, `dataset`, `version` and `type`,
and the function `match()` for matching regular expressions. See the
examples below for typical sample expressions.

The filter pattern (-f option) is a wild card pattern which defines
what to search. It may use the wild card '*' to match any string within
one directory level, '**' to match any number of subdirectories, and
'***' to match any subdirectory or monitor element object. Names with
a trailing slash match only directories; names without a trailing slash
match only plain objects. The pattern must always start with a slash.

For example the pattern '/*/EventInfo/' matches all `EventInfo` sub-
directories one level down from the root. The pattern '/*/Ev*/*Summary*'
matches all plain objects whose names contain 'Summary' in subdirectories
starting with 'Ev' one level down from the per-sample root directory.
The pattern '/CSC/***' matches all directories and objects inside the
top-level 'CSC' directory.

It is important to use a sufficiently strict pattern to avoid unnecessary
- and very expensive - traversal of the full index. There can easily be
tens of thousands of directories to traverse per sample, and retrieving
them all can get very expensive.

Examples of use:

dqm-grep -f '/CSC/Event*/*Summary*' -e 'match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
dqm-grep -f '/*/Event*/*Summary*' -e 'run == 168330 and match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
dqm-grep -f '/*/Event*/***' -e 'run == 168330 and match("/StreamHLTMON/Run2011A-Express-v4/DQM", dataset)'
dqm-grep -f '/C*/***' -e 'run == 168330 and match("/.*/Run2011A-Express-v4/DQM", dataset)'
dqm-grep -f '/*/EventInfo/CertificationSummary' -e 'match("/StreamExpress.*/Run2011A-Express-v4/DQM", dataset)'

In order to authenticate to the target server, a standard grid certificate
environment must be available. Typically this means the X509_CERT_DIR and
either the X509_USER_PROXY or the X509_USER_CERT and X509_USER_KEY
environment variables. If these variables are not set, the following
defaults are checked for existence. Note that if the script falls back on
using a key rather than a proxy, it will prompt for the key password.
- $X509_CERT_DIR: /etc/grid-security/certificates
- $X509_USER_KEY: $HOME/.globus/userkey.pem
- $X509_USER_CERT: $HOME/.globus/usercert.pem
"""

from DQMServices.Components.HTTP import RequestManager
from DQMServices.Components.X509 import SSLOptions
import sys, re, json, pycurl
from urllib.parse import quote
from optparse import OptionParser
from time import time

# Object types.
DIR = 0   # Directory.
FILE = 1  # File / simple object.
ANY = 2   # Either; used only for filters.

# HTTP protocol `User-agent` identification string.
ident = "DQMGrep/1.0 python/%s.%s.%s" % sys.version_info[:3]

# Where to find JSON contents at a given server.
url_content = "/%(section)s/%(run)d%(dataset)s%(path)s"

# SSL/X509 options.
ssl_opts = None

# HTTP request manager for content requests.
reqman = None

# Number of HTTP requests made for content.
nreq = 0

# Found objects (per sample).
found = []

class filter:
    """One step of a search filter.

    - `type`: the type of object the filter can match: `FILE`, `DIR` or `ANY`.
    - `recurse`: apply the pattern recursively to subdirectories if True.
    - `pattern`: the regular expression pattern as a string.
    - `rx`: the regular expression as a compiled regexp object."""
    type = FILE
    recurse = False
    pattern = ""
    rx = None

    def __repr__(self):
        return "(filter pattern='%s' type=%s recurse=%s)" \
               % (self.pattern, self.type, self.recurse)

def pattern_to_filter(pattern):
    """Convert a search pattern into a search filter.

    The pattern must be in the form of a path with '*' wild cards, for
    example "/*/EventInfo/*Summary". A single star matches any string except
    slashes, i.e. it matches within a single directory. A double star matches
    directories recursively.

    A name with a trailing slash matches directories. A name without a
    trailing slash matches non-directories. A triple star matches either
    directories or non-directories.

    The patterns match against the full path, and therefore must always start
    with a slash. If you want to search the entire tree, use "/**/Name*".

    The pattern "/*/EventInfo/" matches folders named 'EventInfo' one level
    down from the top; it will not recurse further down the tree since it is
    known that no deeper matches are possible. The pattern
    "/*/EventInfo/**/*Summary" matches any non-directory object whose name
    ends in "Summary" anywhere inside "EventInfo" one level down from the top.

    Returns a list of `filter` objects representing the pattern, each of
    which represents one or more levels of matching/recursion.
    """
    filters = []

    # Check the pattern starts with '/'.
    if not pattern.startswith("/"):
        raise ValueError("pattern must start with slash")

    # Process the pattern as directory search specs, but first collapse
    # repeated slashes into one slash.
    for part in re.sub("/+", "/", pattern).split('/')[1:]:
        # A non-terminal path component can only match directories, so
        # upgrade the previous step from FILE to DIR.
        if filters and filters[-1].type == FILE:
            filters[-1].type = DIR
        f = filter()
        filters.append(f)
        for term in re.split("([*]+)", part):
            if term == "***":
                f.pattern += ".*"
                f.recurse = True
                f.type = ANY
            elif term == "**":
                f.pattern += ".*"
                f.recurse = True
                f.type = DIR
            elif term == "*":
                f.pattern += "[^/]*"
                f.type = FILE
            elif term:
                f.pattern += re.escape(term)
                f.type = FILE
        if f.pattern != ".*":
            f.pattern = "^%s$" % f.pattern
        f.rx = re.compile(f.pattern)

    return filters

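# Illustrative note (not part of the original script): tracing
# pattern_to_filter() on the default pattern "/*/EventInfo/*Summary"
# produces three steps, printed via filter.__repr__ as
#   (filter pattern='^[^/]*$' type=0 recurse=False)         # any single directory
#   (filter pattern='^EventInfo$' type=0 recurse=False)     # an 'EventInfo' directory
#   (filter pattern='^[^/]*Summary$' type=1 recurse=False)  # plain objects ending in 'Summary'
# where type 0 is DIR and type 1 is FILE.
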
def should_process_sample(s, expr):
    """Evaluate the sample predicate expression `expr` against sample `s`.
    Returns True if the sample should be processed, False otherwise."""
    try:
        s['match'] = lambda rx, str: re.match(rx, str)
        s['run'] = int(s['run'])
        val = eval(expr, {}, s)
        del s['match']
        return val
    except Exception:
        return False

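# Illustrative note (not part of the original script): each sample yielded by
# find_matching_samples() below is a dictionary providing at least the keys
# `run`, `dataset`, `version` and `type` (see the usage text above). A sample
# roughly like
#   {'type': 'offline_data', 'run': '168330',
#    'dataset': '/StreamHLTMON/Run2011A-Express-v4/DQM', 'version': ''}
# would pass should_process_sample() with the expression
#   run == 168330 and match("/StreamHLTMON/.*/DQM", dataset)
# (the field values shown here are made up for illustration).
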
def find_matching_samples(options):
    """Generator which yields all samples at the target server that
    match the requested predicate expression."""
    all_samples = {}

    def req_error(c, url, errmsg, errno):
        print("%s: failed to retrieve samples: %s (%d)"
              % (options.server, errmsg, errno), file=sys.stderr)
        sys.exit(1)

    def req_done(c):
        json_decoder = json.decoder.JSONDecoder()
        all_samples['result'] = json_decoder.decode(c.buffer.getvalue().decode('utf-8'))

    reqman = RequestManager(ssl_opts = ssl_opts,
                            user_agent = ident,
                            request_respond = req_done,
                            request_error = req_error)
    reqman.put((options.server + "/samples",))
    reqman.process()

    if not all_samples:
        print("%s: no samples" % options.server, file=sys.stderr)
        sys.exit(1)

    for sample_type in all_samples['result']['samples']:
        for sample in sample_type['items']:
            if should_process_sample(sample, options.sample_expr):
                yield sample

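# Illustrative note (not part of the original script): the code above assumes
# the /samples endpoint returns JSON of roughly the shape
#   {"samples": [{"type": "...", "items": [{"run": ..., "dataset": ..., ...},
#                                          ...]},
#                ...]}
# i.e. one group per sample type, each carrying a list of per-sample
# dictionaries.
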
def request_init(c, options, sample, filters, pos, path):
    """`RequestManager` callback to initialise a JSON contents request."""
    sample.update(path = path)
    c.url = options.server + quote(url_content % sample)
    c.setopt(pycurl.URL, c.url)
    if False and options.verbose:  # flip to True for per-request URL debugging
        print(c.url)

def report_error(c, task, errmsg, errno):
    """`RequestManager` callback to report JSON contents request errors."""
    print("FAILED to retrieve %s: %s (%d)" % (task, errmsg, errno), file=sys.stderr)

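# Illustrative note (not part of the original script): for a sample with
# section 'archive', run 168330, dataset '/StreamHLTMON/Run2011A-Express-v4/DQM'
# and path '/CSC/', url_content expands to
#   /archive/168330/StreamHLTMON/Run2011A-Express-v4/DQM/CSC/
# which request_init() quotes and appends to the server URL, e.g.
#   https://cmsweb.cern.ch/dqm/offline/data/json/archive/168330/...
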
def match_filters(item, filters, poslist):
    """Match the filter list created by `pattern_to_filter` against an object.

    The input arguments are:
    - `item`: JSON for the object from the server.
    - `filters`: List of all filters; not modified in any way.
    - `poslist`: The list of positions in `filters` where to search.

    The search initially begins with `poslist` equal to [0], i.e. the first
    filter. The function builds a new poslist to use for subdirectories. For
    each non-recursive filter, the old filter is effectively removed from the
    list and the next filter (if any) is added back. Recursive filters stay in
    the list, so effectively `poslist` maintains an NFA search stack of all
    active search positions.

    The function returns a tuple consisting of:
    - `name`: the name of the object that was matched, subdirectory or plain
    - `matched`: True if the entire filter chain has matched for this object
    - `descend`: True if this was a subdirectory the filters require descending
      into; note that this is different from `matched`, it is basically a
      non-terminal match on a subdirectory object
    - `poslist`: the new filter position list for searching any subdirectories;
      this will be empty if the filter list has been exhausted, with or without
      a match.
    """
    newposlist = []
    descend = False
    matched = False
    name = None

    for idx in poslist:
        assert idx < len(filters)
        f = filters[idx]
        fmatched = False
        if 'subdir' in item \
           and (f.type == DIR or f.type == ANY) \
           and f.rx.match(item['subdir']):
            descend = fmatched = True
            name = item['subdir']
        elif 'obj' in item \
             and (f.type == FILE or f.type == ANY) \
             and f.rx.match(item['obj']):
            fmatched = True
            name = item['obj']

        if fmatched:
            if idx == len(filters)-1:
                matched = True
            if f.recurse:
                newposlist.append(idx)
            if idx < len(filters) - 1:
                newposlist.append(idx+1)

    return name, matched, descend, newposlist

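# Illustrative note (not part of the original script): for the pattern
# '/C*/***', pattern_to_filter() yields two steps, a non-recursive DIR filter
# '^C[^/]*$' and a recursive ANY filter '.*'. Starting from poslist [0], a
# top-level subdirectory such as 'CSC' matches step 0, so match_filters()
# returns descend=True with the new poslist [1]; inside '/CSC/' every object
# matches the final recursive step, so matched=True and the poslist stays [1]
# for further descent.
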
def process(c):
    """`RequestManager` callback to handle a JSON content response.

    This gets called once for every directory which has been successfully
    retrieved from the server. It applies `match_filters` to all objects
    found, requests subdirectories as necessary, and adds objects which
    matched the entire filter expression to `found`.

    If verbosity has been requested, it also shows a simple progress bar,
    one dot for every ten directories retrieved."""
    global found, nreq
    options, sample, filters, pos, path = c.task
    json_decoder = json.decoder.JSONDecoder()

    nreq += 1
    if options.verbose and nreq % 10 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()
        if nreq % 750 == 0:
            print()

    # Clean up the server reply so the strict JSON decoder accepts it:
    # collapse doubled quotes around string values and replace bare 'nan'
    # statistics with the string "NaN".
    reply = c.buffer.getvalue().decode('utf-8')
    reply = re.sub(r'("value": ")"([A-Za-z0-9_]+")"', r'\1\2', reply)
    reply = re.sub(r'("(?:mean|rms|min|max)":) nan,', r'\1 "NaN",', reply)
    reply = json_decoder.decode(reply)

    seen = set()
    for item in reply['contents']:
        name, match, descend, newpos = match_filters(item, filters, pos)
        if match:
            found.append((path + name, item))
        if descend and name not in seen:
            reqman.put((options, sample, filters, newpos, path + name + "/"))
            seen.add(name)

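# Illustrative note (not part of the original script): each per-directory JSON
# reply is assumed to look roughly like
#   {"contents": [{"subdir": "EventInfo"},
#                 {"obj": "reportSummary", "value": "...", ...},
#                 ...]}
# i.e. a flat list in which directories carry a 'subdir' key and plain monitor
# elements carry an 'obj' key (plus 'value' or 'properties'/'nentries', as used
# when printing the results below).
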
# Parse command line options.
op = OptionParser(usage = __doc__)
op.add_option("-e", "--samples", dest = "sample_expr", metavar = "EXPRESSION",
              help = "Evaluate EXPRESSION to decide which samples to scan")
op.add_option("-f", "--filter", dest = "glob",
              type = "string", action = "store", metavar = "PATTERN",
              default = "/*/EventInfo/*Summary",
              help = "Filter monitor elements matching PATTERN")
op.add_option("-s", "--server", dest = "server",
              type = "string", action = "store", metavar = "SERVER",
              default = "https://cmsweb.cern.ch/dqm/offline/data/json",
              help = "Pull content from SERVER")
op.add_option("-n", "--connections", dest = "connections",
              type = "int", action = "store", metavar = "NUM",
              default = 10, help = "Use NUM concurrent connections")
op.add_option("-v", "--verbose", dest = "verbose",
              action = "store_true", default = False,
              help = "Show verbose scan information")
options, args = op.parse_args()
if args:
    print("Too many arguments", file=sys.stderr)
    sys.exit(1)
if not options.sample_expr:
    print("Sample predicate expression required", file=sys.stderr)
    sys.exit(1)
if not options.glob:
    print("Monitor element filter expression required", file=sys.stderr)
    sys.exit(1)
if not options.server:
    print("Server contact string required", file=sys.stderr)
    sys.exit(1)

# Get SSL X509 parameters.
ssl_opts = SSLOptions()
if options.verbose:
    print("Using SSL cert dir", ssl_opts.ca_path)
    print("Using SSL private key", ssl_opts.key_file)
    print("Using SSL public key", ssl_opts.cert_file)

# Convert the glob pattern into a filter expression.
filters = pattern_to_filter(options.glob)

# Start a request manager for contents.
reqman = RequestManager(num_connections = options.connections,
                        ssl_opts = ssl_opts,
                        user_agent = ident,
                        request_init = request_init,
                        request_respond = process,
                        request_error = report_error)

# Process all samples matching the predicate.
ntotreq = 0
nfound = 0
start = time()
for sample in find_matching_samples(options):
    nreq = 0
    found = []
    sample['section'] = 'archive'
    if options.verbose:
        print("Scanning %s" % sample)
    reqman.put((options, sample, filters, [0], "/"))
    reqman.process()
    if options.verbose:
        print()
    if found:
        print("%(section)s/%(run)d%(dataset)s:" % sample)
        found.sort()
        for path, item in found:
            if 'subdir' in item:
                print(" %s/" % path)
            elif 'value' in item:
                print(" %s = %s" % (path, item['value']))
            else:
                print(" %s = [%s # %d]" % (path, item['properties']['type'], item['nentries']))
    nfound += len(found)
    ntotreq += nreq
end = time()

# Provide a final summary.
if options.verbose:
    print("\nFound %d objects in %d directories in %.3f seconds" % (nfound, ntotreq, end - start))
0381