Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-26 02:34:38

0001 #!/usr/bin/env python3
0002 
0003 #Hack to get ROOT to ignore command line arguments that we want
0004 #to pass to Python
0005 
def import_ROOT():
    """Import ROOT without letting it see this script's command line arguments.

    While the import runs, ``sys.argv`` is replaced by ``['-b', '-n']`` so
    that the options meant for Python are not visible to ROOT. The original
    ``sys.argv`` is put back afterwards, also when the import fails.

    Returns:
        module: The imported ROOT module

    Raises:
        ImportError: If ROOT cannot be imported
    """
    import sys
    saved_argv = sys.argv
    sys.argv = ['-b', '-n']
    try:
        import ROOT
    finally:
        # Restore unconditionally: a failed import must not leave the
        # process running with the placeholder argument list.
        sys.argv = saved_argv
    return ROOT
0013 
0014 import yaml
0015 import subprocess
0016 import logging
0017 import json
0018 import argparse
0019 import glob
0020 import multiprocessing
0021 import optparse
0022 import shlex
0023 import os
0024 
0025 LOG_MODULE_NAME = logging.getLogger(__name__)
0026 
class Dataset:

    """Datatype that represents a DAS dataset

    Attributes:
        name (string): The DAS name of the dataset
        process (string): The nickname for the physics process that this dataset belongs to
        global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
        cache_location (string): The location of the local file cache
        use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
        tmpdir (string): The directory under which the "das_cache" folder with the file lists is kept
        files: Initialised to None, not used by the methods of this class
        max_files: Initialised to None, not used by the methods of this class
    """

    def __init__(self, name, process, global_file_prefix, cache_location, use_cache, tmpdir):
        """Store the dataset description, no I/O is done here

        Args:
            name (string): The DAS name of the dataset
            process (string): The nickname for the physics process that this dataset belongs to
            global_file_prefix (string): The ROOT TFile prefix that allows to open an LFN (/store/...)
            cache_location (string): The location of the local file cache
            use_cache (boolean): If true, access files from cache_location instead of global_file_prefix in jobs
            tmpdir (string): The directory under which the "das_cache" folder with the file lists is kept
        """
        self.name = name
        self.process = process
        self.global_file_prefix = global_file_prefix
        self.cache_location = cache_location
        self.use_cache = use_cache
        self.tmpdir = tmpdir
        self.files = None
        self.max_files = None

    def __repr__(self):
        """

        Returns:
            string: The string representation of the Dataset
        """
        return "Dataset(name={0})".format(self.name)

    def escape_name(self):
        """Replaces the slashes in the name such that it can be used as a filename

        Every "/" becomes "__" and a leading "__" is dropped.

        Returns:
            string: The DAS name usable as a filename
        """
        name = self.name.replace("/", "__")
        if name.startswith("__"):
            name = name[2:]
        return name

    def get_das_cache_filename(self):
        """Builds the path of the text file that holds the file list of this dataset

        The file is named after the process nickname, not after the DAS name.

        Returns:
            string: <tmpdir>/das_cache/<process>.txt
        """
        return os.path.join(self.tmpdir, "das_cache", self.process + ".txt")

    def get_filenames(self):
        """Reads the file list from the cache file written by cache_das_filenames

        Returns:
            list of string: One entry per non-empty line of the cache file,
                each starting with global_file_prefix

        Raises:
            OSError: If the cache file cannot be opened
        """
        prefix = self.global_file_prefix
        filenames = []
        with open(self.get_das_cache_filename(), "r") as fi:
            for line in fi:
                entry = line.strip()
                if not entry:
                    continue
                # cache_das_filenames already stores the prefix in the file,
                # so only add it to entries that do not carry it yet.
                if not entry.startswith(prefix):
                    entry = prefix + entry
                filenames.append(entry)
        return filenames

    def cache_das_filenames(self):
        """Queries DAS for the files of the dataset and writes them to the cache file

        Runs dasgoclient, keeps the output lines ending in ".root" and writes
        them, with global_file_prefix prepended, one per line to the file
        given by get_das_cache_filename. The directory is created if needed.

        Raises:
            subprocess.CalledProcessError: If dasgoclient exits with a non-zero status
            OSError: If dasgoclient cannot be started or the cache file cannot be written
        """
        LOG_MODULE_NAME.info("caching dataset {0}".format(self.name))
        # Argument list instead of a shell string: the dataset name reaches
        # dasgoclient verbatim, whatever characters it contains.
        das_output = subprocess.check_output(
            ["dasgoclient", "--query=file dataset={0}".format(self.name), "--limit=0"])

        cache_filename = self.get_das_cache_filename()
        os.makedirs(os.path.dirname(cache_filename), exist_ok=True)

        nfiles = 0
        with open(cache_filename, "w") as fi:
            for line in das_output.decode().splitlines():
                line = line.strip()
                if line.endswith(".root"):
                    fi.write(self.global_file_prefix + line + "\n")
                    nfiles += 1

        LOG_MODULE_NAME.info("retrieved {0} files from DAS".format(nfiles))
0123 
if __name__ == "__main__":

    # Prefix put in front of every file name; other values kept for
    # reference: "" and "root://xrootd-cms.infn.it//"
    prefix = "root://cmsxrootd.fnal.gov//"
    tmpdir = "tmp"

    # (DAS dataset name, process nickname) of every sample to cache
    samples = (
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_13_3_0_pre3-132X_mcRun3_2023_realistic_v4-v1/GEN-SIM-DIGI-RAW", "QCD_noPU"),
        ("/RelValQCD_FlatPt_15_3000HS_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "QCD_PU"),
        ("/RelValZEE_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZEE_PU"),
        ("/RelValZMM_14/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "ZMM_PU"),
        ("/RelValTenTau_15_500/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "TenTau_PU"),
        ("/RelValNuGun/CMSSW_12_1_0_pre2-PU_121X_mcRun3_2021_realistic_v1-v1/GEN-SIM-DIGI-RAW", "NuGun_PU"),
    )

    # All datasets share the prefix and tmpdir and do not use a local cache
    datasets = [
        Dataset(das_name, nickname, prefix, None, False, tmpdir)
        for das_name, nickname in samples
    ]

    # Write one file list per dataset below tmpdir
    for ds in datasets:
        ds.cache_das_filenames()
0139 
0140 
0141