Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:33:16

0001 #!/usr/bin/env python3
0002 from __future__ import print_function
0003 
0004 #Hack to get ROOT to ignore command line arguments that we want
0005 #to pass to Python
0006 
def import_ROOT():
    """Import the ROOT module while hiding the real command line from it.

    The script's own command line arguments are meant for Python, not for
    ROOT, so ``sys.argv`` is temporarily replaced by ``['-b', '-n']`` for the
    duration of the import (presumably ROOT's batch / no-logon switches --
    confirm against the ROOT documentation).

    The original ``sys.argv`` list is always put back, even when the import
    fails, so a missing ROOT installation does not leave the process with a
    clobbered command line.

    Returns:
        module: The imported ROOT module

    Raises:
        ImportError: If ROOT is not available in the current environment
    """
    import sys

    saved_argv = sys.argv
    sys.argv = ['-b', '-n']
    try:
        import ROOT
    finally:
        # Restore the very same list object that was in place before the import.
        sys.argv = saved_argv
    return ROOT
0014 
0015 import yaml
0016 import subprocess
0017 import logging
0018 import json
0019 import argparse
0020 import glob
0021 import multiprocessing
0022 import optparse
0023 import shlex
0024 import os
0025 
# Module-level logger, named after this module; Dataset.cache_das_filenames
# reports its progress through it.
LOG_MODULE_NAME = logging.getLogger(__name__)
0027 
class Dataset:

    """Datatype that represents a DAS dataset

    Attributes:
        name (string): The DAS name of the dataset
        process (string): The nickname for the physics process that this dataset belongs to
        global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
        cache_location (string): The location of the local file cache
        use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
        tmpdir (string): Directory under which the "das_cache" folder with the cached file lists lives
        files: Initialised to None and not touched by the methods of this class
        max_files: Initialised to None and not touched by the methods of this class
    """

    def __init__(self, name, process, global_file_prefix, cache_location, use_cache, tmpdir):
        """Store the dataset description; no I/O is performed here

        Args:
            name (string): The DAS name of the dataset
            process (string): The nickname for the physics process that this dataset belongs to
            global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
            cache_location (string): The location of the local file cache
            use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
            tmpdir (string): Directory under which the "das_cache" folder with the cached file lists lives
        """
        self.name = name
        self.process = process
        self.global_file_prefix = global_file_prefix
        self.cache_location = cache_location
        self.use_cache = use_cache
        self.tmpdir = tmpdir
        self.files = None
        self.max_files = None

    def __repr__(self):
        """

        Returns:
            string: The string representation of the Dataset
        """
        return "Dataset(name={0})".format(self.name)

    def escape_name(self):
        """Removes any slashes and other characters from the name such that it can be used as a filename

        Every "/" becomes "__" and a leading "__" (from the leading slash of a
        DAS name) is dropped.

        Returns:
            string: The DAS name usable as a filename
        """
        name = self.name.replace("/", "__")
        if name.startswith("__"):
            name = name[2:]
        return name

    def get_das_cache_filename(self):
        """Path of the text file that caches the file list of this dataset

        The file is named after the process nickname, not after the DAS name.

        Returns:
            string: <tmpdir>/das_cache/<process>.txt
        """
        return os.path.join(self.tmpdir, "das_cache", self.process + ".txt")

    def get_filenames(self):
        """Read the cached file list of this dataset

        cache_das_filenames() stores every entry with global_file_prefix
        already prepended, so the prefix is only added to entries that do not
        start with it yet; otherwise each name would carry the prefix twice.
        Blank lines are skipped.

        Returns:
            list of string: The file names, each starting with global_file_prefix

        Raises:
            IOError: If the cache file cannot be opened (e.g. cache_das_filenames() was never run)
        """
        prefix = self.global_file_prefix
        filenames = []
        with open(self.get_das_cache_filename(), "r") as fi:
            for li in fi:
                entry = li.strip()
                if not entry:
                    continue
                if not entry.startswith(prefix):
                    entry = prefix + entry
                filenames.append(entry)
        return filenames

    def cache_das_filenames(self):
        """Query DAS for the files of this dataset and write them to the cache file

        Runs "dasgoclient" and writes one line per returned ".root" file, with
        global_file_prefix prepended, to get_das_cache_filename(). The target
        directory is created if needed and an existing cache file is overwritten.

        Returns:
            None

        Raises:
            subprocess.CalledProcessError: If dasgoclient exits with a non-zero status
            OSError: If dasgoclient cannot be started or the cache file cannot be written
        """
        LOG_MODULE_NAME.info("caching dataset {0}".format(self.name))

        # Pass the arguments as a list (no shell), so that the dataset name is
        # never interpreted by a shell.
        query = "file dataset={0}".format(self.name)
        ret = subprocess.check_output(["dasgoclient", "--query=" + query, "--limit=0"])

        cache_filename = self.get_das_cache_filename()
        # exist_ok avoids failing when the directory appears between a check and the creation.
        os.makedirs(os.path.dirname(cache_filename), exist_ok=True)

        nfiles = 0
        with open(cache_filename, "w") as fi:
            for line in ret.decode().splitlines():
                # Tolerate stray whitespace around the LFN before testing the suffix.
                line = line.strip()
                if line.endswith(".root"):
                    fi.write(self.global_file_prefix + line + "\n")
                    nfiles += 1

        LOG_MODULE_NAME.info("retrieved {0} files from DAS".format(nfiles))

        return
0124 
if __name__ == "__main__":

    # ROOT TFile prefix handed to every Dataset below.
    # Alternative values kept for reference: "" and "root://xrootd-cms.infn.it//"
    prefix = "root://cmsxrootd.fnal.gov//"

    # Directory handed to every Dataset as its tmpdir.
    tmpdir = "tmp"

    # (DAS dataset name, process nickname) pairs to cache.
    samples = [
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_13_3_0_pre3-132X_mcRun3_2023_realistic_v4-v1/GEN-SIM-DIGI-RAW", "QCD_noPU"),
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "QCD_PU"),
        ("/RelValZEE_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZEE_PU"),
        ("/RelValZMM_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZMM_PU"),
        ("/RelValTenTau_15_500/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "TenTau_PU"),
        ("/RelValNuGun/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "NuGun_PU"),
    ]

    # No local cache is used: cache_location is None and use_cache is False.
    datasets = [
        Dataset(das_name, nickname, prefix, None, False, tmpdir)
        for das_name, nickname in samples
    ]

    # Query DAS once per dataset, in the order listed above.
    for dataset in datasets:
        dataset.cache_das_filenames()
0140 
0141 
0142