File indexing completed on 2024-11-26 02:34:38
0001
0002
0003
0004
0005
def import_ROOT():
    """Import the ROOT module with ``sys.argv`` temporarily set to ``['-b', '-n']``.

    The command-line arguments of the running script are swapped out for the
    duration of the import and put back afterwards, even when the import
    fails.

    Returns:
        module: The imported ROOT module

    Raises:
        ImportError: If the ROOT module cannot be imported
    """
    import sys

    tmpargv = sys.argv
    sys.argv = ['-b', '-n']
    try:
        import ROOT
    finally:
        # Restore the caller's arguments on every exit path; without this a
        # failed import would leave the process with ['-b', '-n'] as argv.
        sys.argv[:] = tmpargv[:]
    return ROOT
0013
0014 import yaml
0015 import subprocess
0016 import logging
0017 import json
0018 import argparse
0019 import glob
0020 import multiprocessing
0021 import optparse
0022 import shlex
0023 import os
0024
0025 LOG_MODULE_NAME = logging.getLogger(__name__)
0026
class Dataset:
    """Datatype that represents a DAS dataset.

    Attributes:
        name (string): The DAS name of the dataset
        process (string): The nickname for the physics process that this dataset belongs to
        global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
        cache_location (string): The location of the local file cache
        use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
        tmpdir (string): Directory under which the "das_cache" folder holding the file lists is created
        files: Initialised to None; not modified by this class
        max_files: Initialised to None; not modified by this class
    """

    def __init__(self, name, process, global_file_prefix, cache_location, use_cache, tmpdir):
        """Store the dataset description.

        Args:
            name (string): The DAS name of the dataset
            process (string): The nickname for the physics process that this dataset belongs to
            global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
            cache_location (string): The location of the local file cache
            use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
            tmpdir (string): Directory under which the "das_cache" folder holding the file lists is created
        """
        self.name = name
        self.process = process
        self.global_file_prefix = global_file_prefix
        self.cache_location = cache_location
        self.use_cache = use_cache
        self.tmpdir = tmpdir
        self.files = None
        self.max_files = None

    def __repr__(self):
        """Return the string representation of the Dataset.

        Returns:
            string: The string representation of the Dataset
        """
        return "Dataset(name={0})".format(self.name)

    def escape_name(self):
        """Replace the slashes in the DAS name such that it can be used as a filename.

        Every "/" becomes "__" and a leading "__" is dropped.

        Returns:
            string: The DAS name usable as a filename
        """
        name = self.name.replace("/", "__")
        if name.startswith("__"):
            name = name[2:]
        return name

    def get_das_cache_filename(self):
        """Return the path of the text file that caches the file list of this dataset.

        The file is named after ``process`` (not ``name``), so two datasets
        sharing a process nickname and tmpdir share one cache file.

        Returns:
            string: ``<tmpdir>/das_cache/<process>.txt``
        """
        return os.path.join(self.tmpdir, "das_cache", self.process + ".txt")

    def get_filenames(self):
        """Read the cached file list and return openable file names.

        cache_das_filenames() already stores every entry with
        ``global_file_prefix`` prepended, so the prefix is only added here to
        entries that do not carry it yet. Blank lines are skipped.

        Returns:
            list of string: One prefixed file name per cached entry

        Raises:
            IOError: If the cache file does not exist or cannot be read
        """
        prefix = self.global_file_prefix
        filenames = []
        with open(self.get_das_cache_filename(), "r") as fi:
            for raw_line in fi:
                entry = raw_line.strip()
                if not entry:
                    continue
                # Avoid doubling the prefix on entries written by
                # cache_das_filenames(), while still accepting bare LFNs.
                if not entry.startswith(prefix):
                    entry = prefix + entry
                filenames.append(entry)
        return filenames

    def cache_das_filenames(self):
        """Query DAS for the files of this dataset and write them to the cache file.

        Runs ``dasgoclient`` for ``file dataset=<name>`` with ``--limit=0``,
        keeps the output lines that end in ".root" and writes each one,
        prefixed with ``global_file_prefix``, to get_das_cache_filename().
        The target directory is created when missing and an existing cache
        file is overwritten.

        Returns:
            None

        Raises:
            subprocess.CalledProcessError: If dasgoclient exits with a non-zero status
            OSError: If the dasgoclient executable cannot be started
        """
        LOG_MODULE_NAME.info("caching dataset %s", self.name)
        # Pass the arguments as a list so the dataset name is never
        # interpreted by a shell.
        ret = subprocess.check_output(
            ["dasgoclient", "--query=file dataset={0}".format(self.name), "--limit=0"]
        )

        cache_filename = self.get_das_cache_filename()
        target_dir = os.path.dirname(cache_filename)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        nfiles = 0
        with open(cache_filename, "w") as fi:
            for line in ret.decode().splitlines():
                line = line.strip()
                if line.endswith(".root"):
                    fi.write(self.global_file_prefix + line + "\n")
                    nfiles += 1

        LOG_MODULE_NAME.info("retrieved %s files from DAS", nfiles)

        return
0123
if __name__ == "__main__":
    # Script entry point: write the DAS file list of every sample below
    # into tmp/das_cache/<process>.txt.
    prefix = "root://cmsxrootd.fnal.gov//"
    tmpdir = "tmp"

    # (DAS dataset name, process nickname) for each sample to cache
    samples = (
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_13_3_0_pre3-132X_mcRun3_2023_realistic_v4-v1/GEN-SIM-DIGI-RAW", "QCD_noPU"),
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "QCD_PU"),
        ("/RelValZEE_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZEE_PU"),
        ("/RelValZMM_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZMM_PU"),
        ("/RelValTenTau_15_500/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "TenTau_PU"),
        ("/RelValNuGun/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "NuGun_PU"),
    )

    datasets = [
        Dataset(das_name, nickname, prefix, None, False, tmpdir)
        for das_name, nickname in samples
    ]
    for ds in datasets:
        ds.cache_das_filenames()
0139
0140
0141