Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-11-25 02:29:03

0001 #!/usr/bin/env python3
0002 
0003 """ Print the total number of events processed by the mille jobs per dataset
0004 
0005 The information is taken from the `mps.db' file. Will group entries of the
0006 same dataset and also datasets the script *thinks* belong to the same
0007 data type, e.g. 0T cosmics. This is implemented very simple and should 
0008 always be checked by the user.
0009 
0010 Usage:
0011 
0012  `python mps_list_evts.py <mps.db file name>' or, after `scram b'
0013  `mps_list_evts.py <mps.db file name>'
0014 
0015 M. Schroeder, DESY Hamburg      26-May-2014
0016 """
0017 
0018 import sys
0019 
0020 
mps_db = "mps.db"               # the mps.db file, default value


def get_mille_lines(db_file=None):
    """ Return list of mps.db lines that correspond to a mille job

    `db_file' is the name of the mps.db file to parse. If it is omitted
    (or None), the current value of the module-level `mps_db' is used.

    Each returned entry is the list of the 13 `:' separated fields of
    one mille job line. Lines with a different number of fields and
    13-field lines that do not start with `<123>:job<123>' are skipped.

    Raises whatever `open' raises if the file cannot be read.
    """
    # Resolve the default at call time (not as a default argument value),
    # so that a later re-assignment of the global `mps_db' is honoured.
    if db_file is None:
        db_file = mps_db

    mille_lines = []
    with open(db_file, "r") as db:
        for line in db:
            # mille and pede job lines have 13 `:' separated fields
            parts = line.rstrip('\n').split(":")
            # mille lines start with `<123>:job<123>'
            if len(parts) == 13 and parts[1] == "job" + parts[0]:
                mille_lines.append(parts)

    return mille_lines
0038 
0039 
0040 
def get_num_evts_per_dataset(mille_lines):
    """ Sum up the number of events for every dataset label

    `mille_lines' is a sequence of field lists, one per mille job. The
    dataset label is read from field 12 and the event count from
    field 6 (converted with `int').

    Returns a dict `<dataset>:<num_evts>'.
    """
    totals = {}
    for fields in mille_lines:
        label = fields[12]
        # a label seen for the first time starts counting from zero
        totals[label] = totals.get(label, 0) + int(fields[6])

    return totals
0057 
0058 
0059 
def get_num_evts_per_merged_dataset(merged_datasets,num_evts_per_dataset):
    """ Sum up the number of events for every merged dataset

    `merged_datasets' maps a merged-dataset name to the list of dataset
    names it consists of; `num_evts_per_dataset' maps a dataset name to
    its number of events.

    Returns a dict `<merged_dataset>:<num_evts>'. Raises KeyError if a
    listed dataset has no entry in `num_evts_per_dataset'.
    """
    return {
        merged_name: sum(num_evts_per_dataset[member] for member in members)
        for merged_name, members in merged_datasets.items()
    }
0074 
0075 
0076 
def merge_datasets(num_evts_per_dataset):
    """ Return dict `<merged_dataset> : list of <dataset>'

    Associates all datasets in `num_evts_per_dataset' that belong by their
    name to the same PD but to a different run era. For example:

    isolated_mu_runa_v1, isolated_mu_runb_v1, isolated_mu_runc_v2 --> isolated_mu

    The merged name is everything in front of the first occurrence of
    `run', with trailing underscores removed. A dataset whose name does
    not contain `run' is kept under its own, unmodified name.

    The returned dict has as value a list of the merged datasets.
    """
    merged_datasets = {}
    for dataset in num_evts_per_dataset:
        run_pos = dataset.find("run")
        if run_pos < 0:
            # `find' returns -1 if there is no `run'; slicing with that
            # would silently chop off the last character of the name
            bare_name = dataset
        else:
            bare_name = dataset[:run_pos].rstrip("_")
        merged_datasets.setdefault(bare_name, []).append(dataset)

    return merged_datasets
0097 
0098 
0099 
def print_merging_scheme(merged_datasets):
    """ Print which datasets make up each merged dataset

    `merged_datasets' maps a merged-dataset name to the list of dataset
    names it was built from. After a header line, each merged dataset is
    printed (preceded by an empty line) followed by its indented members.
    """
    print("Defining the following merged datasets:")
    for merged_name, members in merged_datasets.items():
        block = ["\n  `" + merged_name + "' from:"]
        block.extend("    `" + member + "'" for member in members)
        print("\n".join(block))
0111 
0112 
0113 
def print_num_evts_per_dataset(num_evts_per_dataset):
    """ Print an aligned table of the number of events per dataset

    `num_evts_per_dataset' maps a name to a number of events. After a
    header line, one row per name is printed in sorted order: the names
    are left-aligned and the numbers right-aligned, each column as wide
    as its longest entry.
    """
    print("The following number of events per dataset have been processed:")
    rows = [(name, str(num_evts_per_dataset[name]))
            for name in sorted(num_evts_per_dataset)]
    # column widths; an empty table has zero-width columns
    name_width = max((len(name) for name, _ in rows), default=0)
    count_width = max((len(count) for _, count in rows), default=0)
    for name, count in rows:
        print(" " + name.ljust(name_width) + " : " + " " + count.rjust(count_width))
0133 
0134 
if __name__ == '__main__':
    # main subroutine: parse the mps.db file given on the command line and
    # print the event counts per dataset and per merged dataset

    if len(sys.argv) < 2:
        # no file name given: show the usage and give up
        print("\n".join((
            'ERROR',
            'usage:',
            '  python mps_list_evts.py <mps.db file name>  or, after scram b',
            '  mps_list_evts.py <mps.db file name>',
        )))
        sys.exit(1)

    # overrides the module-level default read by get_mille_lines()
    mps_db = sys.argv[1]
    print('Parsing '+mps_db)

    # collect the numbers first, print afterwards
    mille_lines = get_mille_lines()
    num_evts_per_dataset = get_num_evts_per_dataset(mille_lines)
    merged_datasets = merge_datasets(num_evts_per_dataset)
    num_evts_per_merged_dataset = get_num_evts_per_merged_dataset(
        merged_datasets, num_evts_per_dataset
    )

    print("\n")
    print_num_evts_per_dataset(num_evts_per_dataset)
    print("\n\n")
    print_merging_scheme(merged_datasets)
    print("\n\n")
    print_num_evts_per_dataset(num_evts_per_merged_dataset)
0159 
0160