File indexing completed on 2024-04-06 12:33:16
0001
0002 from __future__ import print_function
0003
0004
0005
0006
def import_ROOT():
    """Import ROOT without letting it see the command line of this script.

    While ``ROOT`` is being imported, ``sys.argv`` is replaced by
    ``['-b', '-n']`` (ROOT's batch-mode and no-logon-macro switches). The
    original arguments are put back afterwards, also when the import fails,
    so a missing or broken ROOT installation does not leave the process with
    a clobbered ``sys.argv``.

    Returns:
        module: The imported ``ROOT`` module

    Raises:
        ImportError: If ``ROOT`` cannot be imported
    """
    import sys

    saved_argv = sys.argv
    sys.argv = ['-b', '-n']
    try:
        import ROOT
    finally:
        # Copy the original arguments back into the list currently bound to
        # sys.argv; done in ``finally`` so a failed import is cleaned up too.
        sys.argv[:] = saved_argv[:]
    return ROOT
0014
0015 import yaml
0016 import subprocess
0017 import logging
0018 import json
0019 import argparse
0020 import glob
0021 import multiprocessing
0022 import optparse
0023 import shlex
0024 import os
0025
0026 LOG_MODULE_NAME = logging.getLogger(__name__)
0027
class Dataset:
    """Datatype that represents a DAS dataset

    Attributes:
        name (string): The DAS name of the dataset
        process (string): The nickname for the physics process that this dataset belongs to
        global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
        cache_location (string): The location of the local file cache
        use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
        tmpdir (string): The directory below which the "das_cache" file lists are kept
        files: Initialised to None; not otherwise touched by this class
        max_files: Initialised to None; not otherwise touched by this class
    """

    def __init__(self, name, process, global_file_prefix, cache_location, use_cache, tmpdir):
        """Store the dataset description; no file or network access happens here

        Args:
            name (string): The DAS name of the dataset
            process (string): The nickname for the physics process that this dataset belongs to
            global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
            cache_location (string): The location of the local file cache
            use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
            tmpdir (string): The directory below which the "das_cache" file lists are kept
        """
        self.name = name
        self.process = process
        self.global_file_prefix = global_file_prefix
        self.cache_location = cache_location
        self.use_cache = use_cache
        self.tmpdir = tmpdir
        self.files = None
        self.max_files = None

    def __repr__(self):
        """
        Returns:
            string: The string representation of the Dataset
        """
        return "Dataset(name={0})".format(self.name)

    def escape_name(self):
        """Turns the DAS name into something usable as a filename

        Every "/" is replaced by "__" and a leading "__" (coming from the
        leading slash of a DAS name) is dropped. No other character is changed.

        Returns:
            string: The DAS name usable as a filename
        """
        name = self.name.replace("/", "__")
        if name.startswith("__"):
            name = name[2:]
        return name

    def get_das_cache_filename(self):
        """Builds the path of the file list that belongs to this dataset

        Returns:
            string: <tmpdir>/das_cache/<process>.txt
        """
        return os.path.join(self.tmpdir, "das_cache", self.process + ".txt")

    def get_filenames(self):
        """Reads the file list written by cache_das_filenames

        cache_das_filenames stores every entry with global_file_prefix already
        prepended, so the prefix is only added to entries that do not carry it
        yet. Each returned name therefore has the prefix exactly once. Blank
        lines are skipped.

        Returns:
            list of string: The prefixed file names, in the order of the cache file

        Raises:
            IOError: If the cache file cannot be opened (e.g. it was never created)
        """
        ret = []
        with open(self.get_das_cache_filename(), "r") as fi:
            for li in fi:
                entry = li.strip()
                if not entry:
                    continue
                if not entry.startswith(self.global_file_prefix):
                    entry = self.global_file_prefix + entry
                ret.append(entry)
        return ret

    def cache_das_filenames(self):
        """Queries DAS for the files of this dataset and writes them to the cache file

        Runs dasgoclient, keeps the output lines that end in ".root" and writes
        each of them, preceded by global_file_prefix, on its own line to the
        file returned by get_das_cache_filename. The directory of that file is
        created when it does not exist; an existing file is overwritten.

        Raises:
            subprocess.CalledProcessError: If the dasgoclient command exits with a non-zero status
        """
        LOG_MODULE_NAME.info("caching dataset %s", self.name)
        # NOTE(review): the dataset name is interpolated into a shell command
        # line, so it must come from a trusted source (no quoting is applied).
        ret = subprocess.check_output('dasgoclient --query="file dataset={0}" --limit=0'.format(self.name), shell=True)

        target_dir = os.path.dirname(self.get_das_cache_filename())
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        nfiles = 0
        with open(self.get_das_cache_filename(), "w") as fi:
            for line in ret.decode().split("\n"):
                # Anything that is not a ROOT file name (blank lines, messages) is dropped
                if line.endswith(".root"):
                    fi.write(self.global_file_prefix + line + "\n")
                    nfiles += 1

        LOG_MODULE_NAME.info("retrieved %s files from DAS", nfiles)
0124
if __name__ == "__main__":
    # Prefix put in front of every file name that is written to the cache files
    prefix = "root://cmsxrootd.fnal.gov//"

    # The file lists end up in <tmpdir>/das_cache/<nickname>.txt
    tmpdir = "tmp"

    # (DAS dataset name, process nickname) of every sample to be cached
    samples = (
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_13_3_0_pre3-132X_mcRun3_2023_realistic_v4-v1/GEN-SIM-DIGI-RAW", "QCD_noPU"),
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "QCD_PU"),
        ("/RelValZEE_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZEE_PU"),
        ("/RelValZMM_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZMM_PU"),
        ("/RelValTenTau_15_500/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "TenTau_PU"),
        ("/RelValNuGun/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "NuGun_PU"),
    )

    # No local cache is used for any of the samples (cache_location=None, use_cache=False)
    datasets = [
        Dataset(das_name, nickname, prefix, None, False, tmpdir)
        for das_name, nickname in samples
    ]

    # Query DAS once per dataset and write its file list
    for ds in datasets:
        ds.cache_das_filenames()
0140
0141
0142