Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-26 02:34:35

0001 #! /usr/bin/env python3
0002 '''
0003 Script fetches files matching specified RegExps from DQM GUI.
0004 
0005 Author:  Albertas Gimbutas,  Vilnius University (LT)
0006 e-mail:  albertasgim@gmail.com
0007 '''
0008 ################################################################################
0009 # Change logs:
0010 # 2012-10-22 11:31 - Checking to Download also files <1MB (like GEN samples)
0011 # 2012-07-09 16:10 - BugFix: RELEASE has to be in selected file names.
0012 # 2012-07-09 16:10 - Added How-To examples and command line option
0013 # explanations for -h option.
0014 # 2012-07-08 17:23 - Added file download in bunches.
0015 # 2012-07-06 14:30 - Added multiprocessing for file download:
0016 # http://docs.python.org/library/multiprocessing
0017 # 2012-07-06 14:09 - Added new commandline options implementation.
0018 # 2012-07-06 09:48 - fixed ``--data`` commandline option small bug. Now it
0019 # does not require its value to be specified.
0020 ################################################################################
0021 
0022 import re
0023 import sys
0024 import os
0025 
0026 from multiprocessing import Pool, Queue, Process
0027 from Queue import Empty
0028 from os.path import basename, isfile
0029 from optparse import OptionParser
0030 from urllib2 import build_opener, Request
0031 
0032 try:
0033     from Utilities.RelMon.authentication import X509CertOpen
0034 except ImportError:
0035     from authentication import X509CertOpen
0036 
0037 
def auth_wget(url, chunk_size=1048576):
    """Fetch ``url`` through an opener built around ``X509CertOpen``.

    The response's ``Content-Length`` header decides how it is handled:

    * Below 1 MB the whole payload is read at once. It is written to a local
      file named after the last path component of ``url`` unless that
      component is empty (e.g. a URL ending in "/"), and the payload is
      returned to the caller.
    * Otherwise the payload is streamed to a local file of the same name in
      pieces of ``chunk_size`` bytes, unless that file already exists in the
      current directory, in which case the download is skipped. ``None`` is
      returned in both cases.

    The streaming branch looks the file name up in the module-level
    ``selected_files`` list to number its progress messages, so that list
    has to be populated before a large file is requested.
    """
    opener = build_opener(X509CertOpen())
    url_file = opener.open(Request(url))
    try:
        size = int(url_file.headers["Content-Length"])
        filename = basename(url)

        if size < 1048576:   # if File size < 1MB
            # Always read the payload; only persist it when the URL names a
            # file (an empty basename means the URL points at a directory).
            readed = url_file.read()
            if filename != '':
                with open(filename, 'wb') as outfile:
                    outfile.write(readed)
            return readed

        file_id = selected_files.index(filename)

        if isfile("./%s" % filename):
            print('%d. Exsits on disk. Skipping.' % (file_id + 1))
            return

        print('%d. Downloading...' % (file_id + 1))
        # The context manager closes the file even if a read fails mid-way.
        with open(filename, 'wb') as outfile:
            chunk = url_file.read(chunk_size)
            while chunk:
                outfile.write(chunk)
                chunk = url_file.read(chunk_size)
        print('%d.  Done.' % (file_id + 1))
    finally:
        # Release the HTTP connection on every exit path.
        url_file.close()
0071 
0072 
## Define options
parser = OptionParser(usage='usage: %prog [options]')
parser.add_option('-d', '--data', action='store_true', dest='is_from_data',
                  help='Fetch data relvals.')
parser.add_option('-m', '--mc', action='store_false', dest='is_from_data',
                  help='Fetch Monte Carlo relvals.')
parser.add_option('-r', '--release', action='store', dest='release',
                  help='Release to fetch from. RELEASE format "CMSSW_x_x_x", e.g. CMSSW_5_3_2.')
parser.add_option('-e', '--re', '--regexp', action='store', dest='regexp', default='',
        help='Comma separated regular expresions for file names. e.g. to fetch '+
        'files, which names contain "cos" or "jet" and does not contain "2010", use: '+
        '"cos,jet,^((?!2010).)*$".')
parser.add_option('--mthreads', action='store', default='3', dest='mthreads',
                  help='Number of threads for file download. Default is 3.')
parser.add_option('--dry', action='store_true', default=False, dest='dry_run',
                  help='Show files matched by regular expresion, but do not download them.')
## Parse sys.argv
(options, args) = parser.parse_args()

## Check for option errors
# Validate before touching the values: ``options.release`` is None when the
# option is omitted, so it must not be stripped until it is known to be set.
if options.is_from_data is None:
    parser.error('You have to specify the directory, use --mc for "RelVal" or ' +
                 '--data for "RelValData"')
elif options.release is None:
    parser.error('You have to specify the CMSSW release, use --release option. ' +
                 'E.g. --release CMSSW_5_3_2')
elif not options.mthreads.isdigit():
    parser.error('Bad --mthreads argument format. It has to be integer. E.g. ' +
                 '--mthreads 3')

# Drop stray quotes and '=' left over from forms such as --release="CMSSW_5_3_2".
options.release = options.release.strip('"\'=')
options.regexp = options.regexp.strip('"\'=')

## Use options
relvaldir = "RelVal"
if options.is_from_data:
    relvaldir = "RelValData"

# Capture the "CMSSW_<major>_<minor>_" prefix; the directory name is that
# prefix followed by a literal "x" (e.g. CMSSW_5_3_2 -> CMSSW_5_3_x).
release = re.findall(r'(CMSSW_\d*_\d*_)\d*(?:_[\w\d]*)?', options.release)
if not release:
    parser.error('No such CMSSW release found. Please check the ``--release`` commandline option value.')
releasedir = release[0] + "x"

base_url = 'https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/'
filedir_url = base_url + relvaldir + '/' + releasedir + '/'
filedir_html = auth_wget(filedir_url)
# Under Python 3 the response body is bytes, which a str regex cannot search.
# Under Python 2 ``bytes is str`` and the value is left untouched.
if isinstance(filedir_html, bytes) and not isinstance(filedir_html, str):
    filedir_html = filedir_html.decode('utf-8', 'replace')

#auth_wget("https://cmsweb.cern.ch/dqm/offline/data/browse/ROOT/OfflineData/Run2012/JetHT/0002029xx/DQM_V0001_R000202950__JetHT__Run2012C-PromptReco-v2__DQM.root")
#auth_wget("https://cmsweb.cern.ch/dqm/relval/data/browse/ROOT/RelValData/CMSSW_5_3_x/DQM_V0001_R000205921__JetHT__CMSSW_5_3_3_patch1-PR_newconditions_RelVal_R205921_121105-v2__DQM.root")

file_list_re = re.compile(r"<a href='[-./\w]*'>([-./\w]*)<")
# The first link on the page is skipped; the rest are taken as file names.
all_files = file_list_re.findall(filedir_html)[1:]  # list of file names

# Out-of-range worker counts silently fall back to 3.
options.mthreads = int(options.mthreads)
if options.mthreads > 3 or options.mthreads < 1:
    options.mthreads = 3

### Fetch the files, using multi-processing
# A file is selected only if it matches every user-supplied expression AND
# contains the release string itself.
file_res = [re.compile(r) for r in options.regexp.split(',') + [options.release]]
selected_files = [f for f in all_files if all(r.search(f) for r in file_res)]

print('Downloading files:')
for i, name in enumerate(selected_files):
    print('%d. %s' % (i+1, name))

if not options.dry_run:
    print('\nProgress:')
    pool = Pool(options.mthreads)
    try:
        pool.map(auth_wget, [filedir_url + name for name in selected_files])
    finally:
        # Shut the worker processes down cleanly once all downloads finish.
        pool.close()
        pool.join()