Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-04-28 02:56:50

0001 from __future__ import print_function
0002 
0003 # ===================================================================
0004 # To get the list of input files, call:
0005 #   search(), to get the list of primary files
0006 #   search2(), to get the list of secondary files, if any
0007 # 
0008 # The selection of files is configured thanks to shell
0009 # environment variables: 
0010 # 
0011 #   DD_RELEASE, for example CMSSW_2_2_0_pre1
0012 #   DD_SAMPLE, for example RelValSingleElectronPt35
0013 #   DD_RUN, for example ''
0014 #   DD_COND , for example MC_31X_V2-v1
0015 #   DD_TIER , for example RECO
0016 #   DD_TIER_SECONDARY, for the secondary files, if any
0017 #   
0018 #   DD_SOURCE:
0019 #     das: use das
0020 #     dbs: use dbs search
0021 #     lsf: use dbs lsf
0022 #     /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
0023 #       for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
0024 #       for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
0025 #     /eos/cms/...: assumed to be the path of an eos directory containing the input data files
0026 #       for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
0027 #       for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
0028 #     /...: assumed to be the path of a text file containing the list of input data files
0029 #
0030 # All except DD_SOURCE can use wildcard *.
0031 # ===================================================================
0032 
0033 import os, sys, re  # , das_client
0034 import httplib, urllib, urllib2, types, string  # , os, sys
0035 import Utilities.General.cmssw_das_client as das_client
0036 import json
0037 from json import loads, dumps
0038 
# Make sure every DD_* selection variable exists, so that the rest of the
# module can read os.environ[...] without guarding against KeyError.
# Values already present in the environment are left untouched.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')


def _compile_env_wildcard(variable):
    """Compile the value of environment `variable` as a regex, turning each '*' into '.*'.

    Only '*' is translated; every other character keeps its regular
    expression meaning.
    """
    pattern = os.environ[variable].replace('*', '.*')
    return re.compile(pattern)


# Patterns compiled once, at import time, from the environment seen then.
dd_release_re = _compile_env_wildcard('DD_RELEASE')
dd_sample_re = _compile_env_wildcard('DD_SAMPLE')
dd_cond_re = _compile_env_wildcard('DD_COND')
dd_run_re = _compile_env_wildcard('DD_RUN')
0058 
0059 
def _read_command_lines(command):
    """Run `command` through the shell and return its standard output as a list of lines.

    The pipe is closed even when reading fails.
    """
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _absolute_paths(lines):
    """Strip trailing whitespace from `lines` and keep only those starting with '/'."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith('/')]


def _passes_filters(path, dd_tier_re):
    """Tell whether `path` matches the sample, conditions, tier and run patterns."""
    patterns = (dd_sample_re, dd_cond_re, dd_tier_re, dd_run_re)
    return all(pattern.search(path) is not None for pattern in patterns)


def _dbs_file_query(dd_tier):
    """Build the 'find file where ...' DBS query from the DD_* variables and `dd_tier`."""
    query = "find file"
    separator = " where "
    for field, variable in (('release', 'DD_RELEASE'), ('primds', 'DD_SAMPLE'), ('run', 'DD_RUN')):
        value = os.environ[variable]
        if value != "":
            query += separator + field + " = " + value
            separator = " and "
    return query + separator + "dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*"


def _rfdir_entry_names(directory):
    """Return the entry names printed by `rfdir` for `directory`.

    The name is taken from the ninth whitespace-separated field of each
    output line; lines with fewer fields (blank lines for instance) are
    skipped instead of raising IndexError.
    """
    names = []
    for line in _read_command_lines('rfdir ' + directory):
        fields = line.split()
        if len(fields) > 8:
            names.append(fields[8])
    return names


def _das_search(dd_tier):
    """Find the single DAS dataset matching the DD_* variables and list its files.

    Returns an empty list when no dataset matches, when several distinct
    datasets match, or when the dataset holds no file.
    """
    query = "dataset instance=cms_dbs_prod_global"
    if os.environ['DD_RELEASE'] != "":
        query += " release=" + os.environ['DD_RELEASE']
    if os.environ['DD_SAMPLE'] != "":
        query += " primary_dataset=" + os.environ['DD_SAMPLE']
    if dd_tier != "":
        query += " tier=" + dd_tier
    if os.environ['DD_COND'] != "":
        query += " dataset=*" + os.environ['DD_COND'] + "*"
    if os.environ['DD_RUN'] != "":
        query += " run=" + os.environ['DD_RUN']
    # NOTE: appending " | unique" to the query was tried and left disabled
    # (the original remark suggests it took too long).

    data = das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query:', query)
        return []
    # Collapse leading duplicates; give up as soon as two different
    # dataset names are seen, because the selection is then ambiguous.
    while data['nresults'] > 1:
        if data['data'][0]['dataset'][0]['name'] == data['data'][1]['dataset'][0]['name']:
            data['data'].pop(0)
            data['nresults'] -= 1
        else:
            print('[electronDataDiscovery.py] Several DAS datasets for query:', query)
            for i in range(data['nresults']):
                print(
                    '[electronDataDiscovery.py] dataset[' + str(i) + ']: ' + data['data'][i]['dataset'][0]['name'])
            return []

    dataset = data['data'][0]['dataset'][0]['name']

    query = "file instance=cms_dbs_prod_global dataset=" + dataset
    data = das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))

    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset:', dataset)
        return []
    # The count lives in the reply; there is no local named `nresults`.
    print('there is %d results' % data['nresults'])

    return [str(data['data'][i]['file'][0]['name']) for i in range(data['nresults'])]


def _http_search(dd_tier):
    """Send the DBS query to the cmsweb discovery page and return the file paths of the reply."""
    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    final_input = urllib.quote(_dbs_file_query(dd_tier))

    agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
    ctypes = "text/plain"
    headers = {'User-Agent': agent, 'Accept': ctypes}
    params = {'dbsInst': 'cms_dbs_prod_global',
              'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
              'userInput': final_input,
              'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
    req = urllib2.Request(url, urllib.urlencode(params, doseq=True), headers)

    body = ""
    try:
        response = urllib2.urlopen(req)
        try:
            body = response.read()
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # A 201 answer is reported and treated as an empty reply;
        # any other HTTP error is propagated unchanged.
        if e.code != 201:
            raise
        print(e.headers)
        print(e.msg)

    # The reply is a plain string, not a file object: split it into lines.
    return _absolute_paths(body.splitlines())


def common_search(dd_tier):
    """Return the list of input data files selected by the DD_* environment variables.

    The lookup method is chosen by DD_SOURCE: 'das', 'dbs', 'http', 'lsf',
    a path under /castor/cern.ch/cms/, a path under /eos/cms/, or else the
    name of a text file listing one data file per line.

    dd_tier -- data tier to select; '*' acts as a wildcard.

    Returns a list of file names, empty when nothing is found.
    """
    # Compiled up front, as before, so that an invalid tier pattern is
    # reported whatever the source.
    dd_tier_re = re.compile(dd_tier.replace('*', '.*'))
    source = os.environ['DD_SOURCE']

    if source == "das":

        result = _das_search(dd_tier)

    elif source == "dbs":

        lines = _read_command_lines(
            'dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "'
            + _dbs_file_query(dd_tier) + '"')
        result = _absolute_paths(lines)

    elif source == "http":

        result = _http_search(dd_tier)

    elif source == "lsf":

        # Use the requested tier (so that search2() looks at the secondary
        # tier) and do not append a quote here: the command below already
        # wraps the path in double quotes.
        dbs_path = '/' + os.environ['DD_SAMPLE'] + '/' + os.environ['DD_RELEASE'] + '-' + os.environ['DD_COND'] \
                   + '/' + dd_tier
        if __name__ == "__main__":
            print('dbs path:', dbs_path)
        result = _absolute_paths(_read_command_lines('dbs lsf --path="' + dbs_path + '"'))

    elif source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir

        # Files are expected one directory level below the given directory.
        castor_dir = source.replace('/castor/cern.ch/cms/', '/', 1)
        result = []
        for subdir in _rfdir_entry_names('/castor/cern.ch/cms' + castor_dir):
            for name in _rfdir_entry_names('/castor/cern.ch/cms' + castor_dir + '/' + subdir):
                result.append(castor_dir + '/' + subdir + '/' + name)

    elif source.startswith('/eos/cms/'):  # assumed to be an eos dir

        result = []
        for line in _read_command_lines('eos find -f ' + source):
            path = line.strip().replace('/eos/cms/', '/', 1)
            if path != "" and _passes_filters(path, dd_tier_re):
                result.append(path)

    else:  # DD_SOURCE is assumed to be a file name

        with open(source) as listing:
            candidates = [os.path.expandvars(line.strip()) for line in listing]
        result = [path for path in candidates if path != "" and _passes_filters(path, dd_tier_re)]

        if len(result) == 0:
            diag = '[electronDataDiscovery.py] No more files after filtering with :'
            if os.environ['DD_SAMPLE'] != '': diag += ' ' + os.environ['DD_SAMPLE']
            if os.environ['DD_COND'] != '': diag += ' ' + os.environ['DD_COND']
            if dd_tier != '': diag += ' ' + dd_tier
            if os.environ['DD_RUN'] != '': diag += ' ' + os.environ['DD_RUN']
            print(diag)

    return result
0273 
0274 
def search():
    """Return the list of primary input files, i.e. those selected with the DD_TIER tier."""
    dd_tier = os.environ['DD_TIER']
    # Report the tier actually searched; the message used to format the
    # literal variable name 'DD_TIER' instead of its value.
    print('search in %s' % dd_tier)
    return common_search(dd_tier)
0278 
0279 
def search2():
    """Return the list of secondary input files, i.e. those selected with the DD_TIER_SECONDARY tier."""
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
0282 
0283 
def getCMSdata(data, dbs="prod/global"):
    """Return the sorted list of file names that `dasgoclient` reports for a dataset.

    data -- dataset name placed after 'dataset=' in the DAS query.
    dbs  -- DBS instance placed after 'instance=' in the DAS query.

    Returns a list of strings, one per non-empty output line of the command.
    """
    # Both values are substituted in a single pass: chained replace() calls
    # would also rewrite any "DBS" occurring inside the dataset name itself.
    command = 'dasgoclient --query="file dataset=%s instance=%s" | sort' % (data, dbs)
    pipe = os.popen(command)
    try:
        output = pipe.read()
    finally:
        pipe.close()
    # Dropping empty lines removes the trailing empty string left by the
    # final newline without discarding a real entry when there is none.
    return [name for name in output.split('\n') if name]
0292     return flist