RelMon/scripts/fetchall_from_DQM_v2.py

0001 #! /usr/bin/env python3
0002 '''
0003 Script fetches files matching specified RegExps from DQM GUI.
0004
0005 Author:  Albertas Gimbutas,  Vilnius University (LT)
0006 e-mail:  albertasgim@gmail.com
0007 '''
0008 from __future__ import print_function
0009 ################################################################################
0010 # Change logs:
0011 # 2012-10-22 11:31 - Checking to Download also files <1MB (like GEN samples)
0012 # 2012-07-09 16:10 - BugFix: RELEASE has to be in selected file names.
0013 # 2012-07-09 16:10 - Added How-To examples and command line option
0014 # explanations for -h option.
0015 # 2012-07-08 17:23 - Added file download in bunches.
0016 # 2012-07-06 14:30 - Added multiprocessing for file download:
0017 # http://docs.python.org/library/multiprocessing
0018 # 2012-07-06 14:09 - Added new commandline options implementation.
0019 # 2012-07-06 09:48 - Fixed a small bug in the ``--data`` commandline option.
0020 # It no longer requires a value to be specified.
0021 ################################################################################
0022
0023 import re
0024 import sys
0025 import os
0026
0027 from multiprocessing import Pool, Queue, Process
0028 from Queue import Empty
0029 from os.path import basename, isfile
0030 from optparse import OptionParser
0031 from urllib2 import build_opener, Request
0032
0033 try:
0034     from Utilities.RelMon.authentication import X509CertOpen
0035 except ImportError:
0036     from authentication import X509CertOpen
0037
0038
def auth_wget(url, chunk_size=1048576):
    """Fetch the content of an URL that requires X509 authentication.

    Behaviour depends on what the URL points to:

    * URL without a file name (it ends with '/', i.e. a directory listing):
      the whole content is read and returned; nothing is written to disk.
    * File smaller than 1MB: the content is written to a local file named
      after the last URL component (overwriting it) and is also returned.
    * File of 1MB or more: the content is streamed to a local file in pieces
      of ``chunk_size`` bytes, unless a file with that name already exists,
      in which case the download is skipped. Returns None in both cases.

    The progress messages of the last case use the position of the file
    name in the module-level ``selected_files`` list, so that list has to
    exist and contain the file name when a big file is requested.

    :param url: address of the file or directory to fetch.
    :param chunk_size: number of bytes read per iteration for big files.
    :returns: the content for directory listings and files < 1MB, else None.
    """
    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        filename = basename(url)

        if filename == '':
            # A directory listing has no file name, so there is nothing to
            # save and no entry in ``selected_files`` to look up. Return the
            # content whatever its size is.
            return url_file.read()

        size = int(url_file.headers["Content-Length"])

        if size < 1048576:   # if File size < 1MB: still download
            readed = url_file.read()
            with open(filename, 'wb') as outfile:
                outfile.write(readed)
            return readed

        file_id = selected_files.index(filename)

        if isfile("./%s" % filename):
            print('%d. Exsits on disk. Skipping.' % (file_id +1))
            return

        print('%d. Downloading...' % (file_id +1))
        # Stream in chunks, so that big ROOT files are never held in memory.
        with open(filename, 'wb') as outfile:
            chunk = url_file.read(chunk_size)
            while chunk:
                outfile.write(chunk)
                chunk = url_file.read(chunk_size)
        print('%d.  Done.' % (file_id +1))
    finally:
        # Release the connection on every path, including errors.
        url_file.close()
0072
0073
## Define options
parser = OptionParser(usage='usage: %prog [options]')
parser.add_option('-d', '--data', action='store_true', dest='is_from_data',
                  help='Fetch data relvals.')
parser.add_option('-m', '--mc', action='store_false', dest='is_from_data',
                  help='Fetch Monte Carlo relvals.')
parser.add_option('-r', '--release', action='store', dest='release',
                  help='Release to fetch from. RELEASE format "CMSSW_x_x_x", e.g. CMSSW_5_3_2.')
parser.add_option('-e', '--re', '--regexp', action='store', dest='regexp', default='',
        help='Comma separated regular expresions for file names. e.g. to fetch '+
        'files, which names contain "cos" or "jet" and does not contain "2010", use: '+
        '"cos,jet,^((?!2010).)*$".')
parser.add_option('--mthreads', action='store', default='3', dest='mthreads',
                  help='Number of threads for file download. Default is 3.')
parser.add_option('--dry', action='store_true', default=False, dest='dry_run',
                  help='Show files matched by regular expresion, but do not download them.')
## Parse sys.argv
(options, args) = parser.parse_args()

## Check for option errors (parser.error() prints the message and exits).
## This has to happen before the values are used: ``is_from_data`` and
## ``release`` have no default, so they are None when not given.
if options.is_from_data is None:
    parser.error('You have to specify the directory, use --mc for "RelVal" or ' +
                 '--data for "RelValData"')
elif options.release is None:
    parser.error('You have to specify the CMSSW release, use --release option. ' +
                 'E.g. --release CMSSW_5_3_2')
elif not options.mthreads.isdigit():
    parser.error('Bad --mthreads argument format. It has to be integer. E.g. ' +
                 '--mthreads 3')

## Remove quotes and '=' left around the values (e.g. from --release="...").
options.release = options.release.strip('"\'=')
options.regexp = options.regexp.strip('"\'=')
0105
## Use options
# --data selects the "RelValData" directory, --mc the "RelVal" one.
relvaldir = "RelValData" if options.is_from_data else "RelVal"

# Keep only the "CMSSW_<major>_<minor>_" part of the release; the files of
# all its sub-releases are stored in one "CMSSW_<major>_<minor>_x" directory.
release = re.findall(r'(CMSSW_\d*_\d*_)\d*(?:_[\w\d]*)?', options.release)
if not release:
    parser.error('No such CMSSW release found. Please check the ``--release`` commandline option value.')
releasedir = release[0] + "x"

base_url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/'
filedir_url = base_url + relvaldir + '/' + releasedir + '/'
filedir_html = auth_wget(filedir_url)
if not isinstance(filedir_html, str):
    # Under Python 3 the listing arrives as bytes, which can not be searched
    # with the text pattern below. Under Python 2 it is already a ``str``.
    filedir_html = filedir_html.decode('utf-8', 'replace')

# Manual single-file download examples (disabled):
#auth_wget("https://cmsweb.cern.ch/dqm/offline/data/browse/ROOT/OfflineData/Run2012/JetHT/0002029xx/DQM_V0001_R000202950__JetHT__Run2012C-PromptReco-v2__DQM.root")
#auth_wget("https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/CMSSW_5_3_x/DQM_V0001_R000205921__JetHT__CMSSW_5_3_3_patch1-PR_newconditions_RelVal_R205921_121105-v2__DQM.root")

# Collect the link texts of the listing; the first link is dropped.
file_list_re = re.compile(r"<a href='[-./\w]*'>([-./\w]*)<")
all_files = file_list_re.findall(filedir_html)[1:]  # list of file names
0125
# Number of download processes: anything outside of 1..3 falls back to 3.
options.mthreads = int(options.mthreads)
if options.mthreads > 3 or options.mthreads < 1:
    options.mthreads = 3

### Fetch the files, using multi-processing
# A file is selected only if its name matches every user supplied regular
# expression and also contains the requested release.
file_res = [re.compile(r) for r in options.regexp.split(',') + [options.release]]
selected_files = [f for f in all_files if all(r.search(f) for r in file_res)]

print('Downloading files:')
for i, name in enumerate(selected_files):
    print('%d. %s' % (i+1, name))

if not options.dry_run:
    print('\nProgress:')
    pool = Pool(options.mthreads)
    try:
        pool.map(auth_wget, [filedir_url + name for name in selected_files])
    finally:
        # map() has either returned all results or failed here, so the
        # workers are not needed anymore: stop them and wait for their exit.
        pool.terminate()
        pool.join()