EGamma/python/electronDataDiscovery.py

0001
0002 # ===================================================================
# To get the list of input files, one must call:
#   search(), to get the list of primary files
#   search2(), to get the list of secondary files, if any
#
# The selection of files is configured through shell
# environment variables:
0009 #
0010 #   DD_RELEASE, for example CMSSW_2_2_0_pre1
0011 #   DD_SAMPLE, for example RelValSingleElectronPt35
0012 #   DD_RUN, for example ''
0013 #   DD_COND , for example MC_31X_V2-v1
0014 #   DD_TIER , for example RECO
#   DD_TIER_SECONDARY, for secondary files, if any
0016 #
0017 #   DD_SOURCE:
#     das: use das
#     dbs: use dbs search
#     http: use the dbs discovery web page
#     lsf: use dbs lsf
0021 #     /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
0022 #       for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
0023 #       for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
#     /eos/cms/...: assumed to be the path of an eos directory containing the input data files
0025 #       for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
0026 #       for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
0027 #     /...: assumed to be the path of a text file containing the list of input data files
0028 #
0029 # All except DD_SOURCE can use wildcard *.
0030 # ===================================================================
0031
0032 import os, sys, re  # , das_client
0033 import httplib, urllib, urllib2, types, string  # , os, sys
0034 import Utilities.General.cmssw_das_client as das_client
0035 import json
0036 from json import loads, dumps
0037
# Give every DD_* selection variable a value so that later code can index
# os.environ directly. Values already present in the environment are kept.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')

# Selection patterns, compiled once at import time. The shell-style wildcard
# '*' is turned into the regular expression '.*'; everything else in the
# value is handed to the regex engine unchanged.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
0057
0058
def _wildcard_re(pattern):
    """Compile a selection pattern in which '*' stands for any run of characters."""
    return re.compile(pattern.replace('*', '.*'))


def _selection_filter(dd_tier):
    """Return a predicate telling whether a path matches sample, cond, tier and run.

    The patterns are built from the environment at call time, so that the
    path-based sources see the same DD_* values as the query-based ones.
    """
    patterns = [
        _wildcard_re(os.environ['DD_SAMPLE']),
        _wildcard_re(os.environ['DD_COND']),
        _wildcard_re(dd_tier),
        _wildcard_re(os.environ['DD_RUN']),
    ]

    def matches(path):
        return all(pattern.search(path) is not None for pattern in patterns)

    return matches


def _command_lines(command):
    """Run a shell command and return its output lines; the pipe is always closed."""
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _absolute_paths(lines):
    """Return the right-stripped lines that start with '/'."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith("/")]


def _dbs_query(dd_tier):
    """Build the 'find file ...' DBS query from the DD_* environment variables."""
    query = "find file"
    separator = " where "
    for field, variable in (("release", 'DD_RELEASE'),
                            ("primds", 'DD_SAMPLE'),
                            ("run", 'DD_RUN')):
        value = os.environ[variable]
        if value != "":
            query += separator + field + " = " + value
            separator = " and "
    return query + separator + "dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*"


def _das_request(query):
    """Send one query to DAS and return the decoded answer."""
    return das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))


def _search_das(dd_tier):
    """Find the single dataset matching the selection in DAS and list its files."""
    query = "dataset instance=cms_dbs_prod_global"
    if os.environ['DD_RELEASE'] != "":
        query += " release=" + os.environ['DD_RELEASE']
    if os.environ['DD_SAMPLE'] != "":
        query += " primary_dataset=" + os.environ['DD_SAMPLE']
    if dd_tier != "":
        query += " tier=" + dd_tier
    if os.environ['DD_COND'] != "":
        query += " dataset=*" + os.environ['DD_COND'] + "*"
    if os.environ['DD_RUN'] != "":
        query += " run=" + os.environ['DD_RUN']

    data = _das_request(query)
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query:', query)
        return []

    # Drop leading duplicates of the same dataset name; give up as soon as
    # two different datasets remain, since the selection is then ambiguous.
    while data['nresults'] > 1:
        if data['data'][0]['dataset'][0]['name'] == data['data'][1]['dataset'][0]['name']:
            data['data'].pop(0)
            data['nresults'] -= 1
        else:
            print('[electronDataDiscovery.py] Several DAS datasets for query:', query)
            for i in range(data['nresults']):
                print('[electronDataDiscovery.py] dataset[' + str(i) + ']: '
                      + data['data'][i]['dataset'][0]['name'])
            return []

    dataset = data['data'][0]['dataset'][0]['name']
    data = _das_request("file instance=cms_dbs_prod_global dataset=" + dataset)
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset:', dataset)
        return []

    # The count lives in the DAS answer; there is no local 'nresults' name.
    print('there is %d results' % data['nresults'])
    return [str(data['data'][i]['file'][0]['name']) for i in range(data['nresults'])]


def _search_http(dd_tier):
    """Post the DBS query to the dbs_discovery web page and parse the answer.

    Uses the urllib/urllib2 modules imported at the top of the file.
    """
    query = _dbs_query(dd_tier)
    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    headers = {
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        'Accept': "text/plain",
    }
    params = {'dbsInst': 'cms_dbs_prod_global',
              'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
              'userInput': urllib.quote(query),
              'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
    request = urllib2.Request(url, urllib.urlencode(params, doseq=True), headers)

    body = ""
    try:
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            response.close()
    except urllib2.HTTPError as error:
        if error.code != 201:
            raise
        # A 201 answer is reported but tolerated; the result is then empty.
        print(error.headers)
        print(error.msg)

    # read() returns a string, not a file object, so split it into lines.
    return _absolute_paths(body.splitlines())


def _search_lsf(dd_tier):
    """List the files of /<sample>/<release>-<cond>/<tier> with 'dbs lsf'."""
    # The requested tier is used (not always DD_TIER), so that search2()
    # lists the secondary tier. The path carries no quote of its own: the
    # command line below supplies the surrounding pair.
    dbs_path = ('/' + os.environ['DD_SAMPLE'] + '/' + os.environ['DD_RELEASE']
                + '-' + os.environ['DD_COND'] + '/' + dd_tier)
    if __name__ == "__main__":
        print('dbs path:', dbs_path)
    return _absolute_paths(_command_lines('dbs lsf --path="' + dbs_path + '"'))


def _search_castor(source):
    """List the files found one level below a castor directory with 'rfdir'."""
    castor_dir = source.replace('/castor/cern.ch/cms/', '/', 1)
    result = []
    for entry in _command_lines('rfdir /castor/cern.ch/cms' + castor_dir):
        fields = entry.split()
        if len(fields) < 9:  # the name is read from the 9th column
            continue
        subdir = fields[8]
        for sub_entry in _command_lines('rfdir /castor/cern.ch/cms' + castor_dir + '/' + subdir):
            sub_fields = sub_entry.split()
            if len(sub_fields) < 9:
                continue
            result.append(castor_dir + '/' + subdir + '/' + sub_fields[8])
    return result


def _search_eos(source, dd_tier):
    """List the files below an eos directory that match the selection."""
    matches = _selection_filter(dd_tier)
    result = []
    for line in _command_lines('eos find -f ' + source):
        path = line.strip().replace('/eos/cms/', '/', 1)
        if path != "" and matches(path):
            result.append(path)
    return result


def _search_file_list(source, dd_tier):
    """Read file names from a text file and keep those matching the selection."""
    matches = _selection_filter(dd_tier)
    with open(source) as listing:
        candidates = [os.path.expandvars(line.strip()) for line in listing]
    result = [path for path in candidates if path != "" and matches(path)]

    if not result:
        diag = '[electronDataDiscovery.py] No more files after filtering with :'
        if os.environ['DD_SAMPLE'] != '':
            diag += ' ' + os.environ['DD_SAMPLE']
        if os.environ['DD_COND'] != '':
            diag += ' ' + os.environ['DD_COND']
        if dd_tier != '':
            diag += ' ' + dd_tier
        if os.environ['DD_RUN'] != '':
            diag += ' ' + os.environ['DD_RUN']
        print(diag)
    return result


def common_search(dd_tier):
    """Return the list of input files for the given data tier.

    The source is selected by the DD_SOURCE environment variable:
    'das', 'dbs', 'http' and 'lsf' query the corresponding service, a value
    starting with '/castor/cern.ch/cms/' or '/eos/cms/' is listed as a
    directory, and any other value is opened as a text file holding one file
    name per line. The selection itself comes from DD_RELEASE, DD_SAMPLE,
    DD_COND and DD_RUN, which must all be present in the environment.

    Args:
        dd_tier: data tier to select; '*' may be used as a wildcard.

    Returns:
        A list of file names, possibly empty.

    Raises:
        KeyError: if one of the DD_* variables read by the chosen source is
            missing from the environment.
        IOError/OSError: if DD_SOURCE names a text file that cannot be read.
    """
    source = os.environ['DD_SOURCE']

    if source == "das":
        return _search_das(dd_tier)
    if source == "dbs":
        command = ('dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"'
                   ' --query "' + _dbs_query(dd_tier) + '"')
        return _absolute_paths(_command_lines(command))
    if source == "http":
        return _search_http(dd_tier)
    if source == "lsf":
        return _search_lsf(dd_tier)
    if source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir
        return _search_castor(source)
    if source.startswith('/eos/cms/'):  # assumed to be an eos dir
        return _search_eos(source, dd_tier)
    # Anything else is assumed to be the name of a text file.
    return _search_file_list(source, dd_tier)
0272
0273
def search():
    """Return the list of primary input files, selected with the DD_TIER tier."""
    # NOTE(review): this prints the literal text 'DD_TIER', not the value of
    # the variable -- confirm that this is the intended message.
    print('search in %s' % 'DD_TIER')
    primary_tier = os.environ['DD_TIER']
    return common_search(primary_tier)
0277
0278
def search2():
    """Return the list of secondary input files, selected with DD_TIER_SECONDARY."""
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
0281
0282
def getCMSdata(data, dbs="prod/global"):
    """Return the sorted file names that dasgoclient reports for a dataset.

    The query is run through the shell as
    ``dasgoclient --query="file dataset=<data> instance=<dbs>" | sort``.

    Args:
        data: dataset name, inserted as-is into the query.
        dbs: DBS instance, inserted as-is into the query.

    Returns:
        The non-empty output lines of the command, as a list of strings.
        The list is empty when the command prints nothing.
    """
    # Fill both fields in a single formatting step: chained str.replace()
    # calls would also rewrite a 'DBS' substring occurring inside the
    # dataset name itself.
    command = 'dasgoclient --query="file dataset=%s instance=%s" | sort' % (data, dbs)
    pipe = os.popen(command)
    try:
        output = pipe.read()
    finally:
        pipe.close()
    # Keep every non-empty line. Blindly dropping the last element would
    # lose a file name whenever the output does not end with a newline.
    return [name for name in output.split('\n') if name]