Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 11:56:36

0001 #!/usr/bin/env python3
0002 
0003 """ Print the total number of events processed by the mille jobs per dataset
0004 
0005 The information is taken from the `mps.db' file. Will group entries of the
0006 same dataset and also datasets the script *thinks* belong to the same
0007 data type, e.g. 0T cosmics. This is implemented very simply and should 
0008 always be checked by the user.
0009 
0010 Usage:
0011 
0012  `python mps_list_evts.py <mps.db file name>' or, after `scram b'
0013  `mps_list_evts.py <mps.db file name>'
0014 
0015 M. Schroeder, DESY Hamburg      26-May-2014
0016 """
0017 from __future__ import print_function
0018 
0019 import sys
0020 
0021 
0022 mps_db = "mps.db"               # the mps.db file, default value
0023 
0024 
def get_mille_lines(db_file=None):
    """ Return list of mps.db lines that correspond to a mille job

    Each returned entry is the list of the 13 `:'-separated fields of
    one mille-job line.

    Arguments:
    - `db_file': path of the mps.db file to parse; when omitted (the
      default) the module-level `mps_db' is used, which keeps the
      original no-argument call sites working unchanged
    """
    if db_file is None:
        db_file = mps_db        # fall back to the module-level default/override
    mille_lines = []
    with open(db_file, "r") as db:
        for line in db:
            # mille and pede job lines have 13 `:' separated fields;
            # mille lines additionally start with `<123>:job<123>'
            parts = line.rstrip('\n').split(":")
            if len(parts) == 13 and parts[1] == "job"+parts[0]:
                mille_lines.append(parts)

    return mille_lines
0039 
0040 
0041 
def get_num_evts_per_dataset(mille_lines):
    """ Return number of events per dataset

    Returns a dict `<dataset>:<num_evts>', where <dataset> is the label
    in the last field of the mille line and <num_evts> sums field 6 of
    all mille lines carrying that label.

    Arguments:
    - `mille_lines': list of parsed mille lines (13-field lists) as
      returned by `get_mille_lines'
    """
    num_evts_per_dataset = {}
    for line in mille_lines:
        dataset = line[12]
        # single-lookup accumulation; missing keys start from 0
        num_evts_per_dataset[dataset] = num_evts_per_dataset.get(dataset, 0) + int(line[6])

    return num_evts_per_dataset
0058 
0059 
0060 
def get_num_evts_per_merged_dataset(merged_datasets,num_evts_per_dataset):
    """ Return number of events per merged dataset

    Returns a dict `<merged_dataset>:<num_evts>'; see comments to function
    `merge_datasets' for an explanation of <merged_dataset>. The count for
    each merged dataset is the sum over its member datasets' counts.
    """
    return {
        merged_name: sum(num_evts_per_dataset[member] for member in members)
        for merged_name, members in merged_datasets.items()
    }
0075 
0076 
0077 
def merge_datasets(num_evts_per_dataset):
    """ Return dict `<merged_dataset> : list of <dataset>'

    Associates all datasets in `num_evts_per_dataset' that belong by their
    name to the same PD but to a different run era. For example:

    isolated_mu_runa_v1, isolated_mu_runb_v1, isolated_mu_runc_v2 --> isolated_mu

    The returned dict has as value a list of the merged datasets. A dataset
    name that contains no "run" token is kept unchanged as its own group.
    """
    merged_datasets = {}
    for dataset in num_evts_per_dataset:
        run_pos = dataset.find("run")
        # Bug fix: find() returns -1 when "run" is absent, and the old
        # dataset[0:-1] slice then silently chopped the last character.
        if run_pos < 0:
            bare_name = dataset
        else:
            bare_name = dataset[:run_pos].rstrip("_")
        merged_datasets.setdefault(bare_name, []).append(dataset)

    return merged_datasets
0098 
0099 
0100 
def print_merging_scheme(merged_datasets):
    """ Print which datasets were grouped under each merged dataset

    See comments to function `merge_datasets' for an explanation
    of what is meant by merged dataset.
    """
    print("Defining the following merged datasets:")
    for merged_name, members in merged_datasets.items():
        print("\n  `{0}' from:".format(merged_name))
        for member in members:
            print("    `{0}'".format(member))
0112 
0113 
0114 
def print_num_evts_per_dataset(num_evts_per_dataset):
    """ Print number of events per dataset as an aligned two-column table

    See comments to function `get_num_evts_per_dataset' for an
    explanation of what is meant by dataset.
    """
    print("The following number of events per dataset have been processed:")
    names = sorted(num_evts_per_dataset)
    # column widths: longest dataset name and widest event count
    width_name = max((len(name) for name in names), default=0)
    width_num = max((len(str(num_evts_per_dataset[name])) for name in names), default=0)
    for name in names:
        count = str(num_evts_per_dataset[name])
        print(" {0: <{1}} :  {2: >{3}}".format(name, width_name, count, width_num))
0134 
0135 
if  __name__ == '__main__':
    """ main subroutine """

    # The single required argument is the path of the mps.db file.
    if len(sys.argv) < 2:
        print('ERROR')
        print('usage:')
        print('  python mps_list_evts.py <mps.db file name>  or, after scram b')
        print('  mps_list_evts.py <mps.db file name>')
        sys.exit(1)

    # Rebind the module-level `mps_db` so that get_mille_lines(), which
    # reads the global, picks up the user-supplied file instead of the
    # default "mps.db".
    mps_db = sys.argv[1]
    print('Parsing '+mps_db)

    # Pipeline: parse mille job lines -> events per dataset ->
    # group datasets by bare name -> events per merged dataset.
    mille_lines = get_mille_lines()
    num_evts_per_dataset = get_num_evts_per_dataset(mille_lines)
    merged_datasets = merge_datasets(num_evts_per_dataset)
    num_evts_per_merged_dataset = get_num_evts_per_merged_dataset(merged_datasets,num_evts_per_dataset)

    # Report: per-dataset counts, the merging scheme, then merged counts.
    print("\n")
    print_num_evts_per_dataset(num_evts_per_dataset)
    print("\n\n")
    print_merging_scheme(merged_datasets)
    print("\n\n")
    print_num_evts_per_dataset(num_evts_per_merged_dataset)
0160 
0161