1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
|
#!/usr/bin/env python3
""" Print the total number of events processed by the mille jobs per dataset
The information is taken from the `mps.db' file. Will group entries of the
same dataset and also datasets the script *thinks* belong to the same
data type, e.g. 0T cosmics. This is implemented very simply and should
always be checked by the user.
Usage:
`python mps_list_evts.py <mps.db file name>' or, after `scram b'
`mps_list_evts.py <mps.db file name>'
M. Schroeder, DESY Hamburg 26-May-2014
"""
import sys
mps_db = "mps.db" # the mps.db file, default value
def get_mille_lines(db_file=None):
    """ Return list of mps.db lines that correspond to a mille job

    Each entry of the returned list is one matching line, already split
    into its 13 `:'-separated fields (a list of strings).

    `db_file' is the name of the file to parse. If it is omitted (None),
    the module-level `mps_db' file name is used.
    """
    if db_file is None:
        # fall back to the module-level file name (set by the main routine)
        db_file = mps_db
    mille_lines = []
    with open(db_file, "r") as db:
        for line in db:
            parts = line.rstrip('\n').split(":")
            # mille and pede job lines have 13 `:' separated fields;
            # mille lines start with `<123>:job<123>'
            if len(parts) == 13 and parts[1] == "job" + parts[0]:
                mille_lines.append(parts)
    return mille_lines
def get_num_evts_per_dataset(mille_lines):
    """ Return number of events per dataset

    Returns a dict `<dataset>:<num_evts>', where <dataset> is the label
    in the last field of the mille line.

    `mille_lines' is a sequence of mille lines, each split into its 13
    fields. Field 6 is converted to int and summed up per dataset label
    (field 12); a non-integer field 6 raises ValueError.
    """
    num_evts_per_dataset = {}
    for line in mille_lines:
        dataset = line[12]
        # first occurrence of a dataset starts counting from 0
        num_evts_per_dataset[dataset] = (
            num_evts_per_dataset.get(dataset, 0) + int(line[6])
        )
    return num_evts_per_dataset
def get_num_evts_per_merged_dataset(merged_datasets,num_evts_per_dataset):
    """ Return number of events per merged dataset

    Returns a dict `<merged_dataset>:<num_evts>'; see comments to function
    `merge_datasets' for an explanation of <merged_dataset>.

    `merged_datasets' maps each merged dataset name to the list of dataset
    names it combines; `num_evts_per_dataset' maps each dataset name to its
    number of events. A dataset name missing from `num_evts_per_dataset'
    raises KeyError.
    """
    return {
        merged_dataset: sum(num_evts_per_dataset[dataset] for dataset in datasets)
        for merged_dataset, datasets in merged_datasets.items()
    }
def merge_datasets(num_evts_per_dataset):
    """ Return dict `<merged_dataset> : list of <dataset>'

    Associates all datasets in `num_evts_per_dataset' that belong by their
    name to the same PD but to a different run era. For example:
    isolated_mu_runa_v1, isolated_mu_runb_v1, isolated_mu_runc_v2 --> isolated_mu
    The returned dict has as value a list of the merged datasets.

    The merged name is the part of the dataset name in front of the first
    occurrence of `run', with trailing `_' removed. A dataset whose name
    does not contain `run' is kept under its full name (again without
    trailing `_').
    """
    merged_datasets = {}
    for dataset in num_evts_per_dataset:
        # partition() yields the whole name when `run' is absent; slicing
        # with the -1 returned by find() would chop off the last character
        bare_name = dataset.partition("run")[0].rstrip("_")
        merged_datasets.setdefault(bare_name, []).append(dataset)
    return merged_datasets
def print_merging_scheme(merged_datasets):
    """ Print which datasets are combined into each merged dataset

    `merged_datasets' is a dict `<merged_dataset> : list of <dataset>'.
    For every merged dataset its name is printed, followed by the names
    of the datasets it is built from. No event numbers are printed here.
    See comments to function `merge_datasets' for an explanation
    of what is meant by merged dataset.
    """
    print("Defining the following merged datasets:")
    for merged_dataset, datasets in merged_datasets.items():
        # leading newline separates the groups by an empty line
        print("\n `{0}' from:".format(merged_dataset))
        for dataset in datasets:
            print(" `{0}'".format(dataset))
def print_num_evts_per_dataset(num_evts_per_dataset):
    """ Print number of events per dataset

    `num_evts_per_dataset' is a dict `<dataset>:<num_evts>'. The entries
    are printed sorted by dataset name, one per line, with the names
    left-aligned and the numbers right-aligned in columns just wide enough
    for the longest name and the longest number.
    See comments to function `get_num_evts_per_dataset' for an
    explanation of what is meant by dataset.
    """
    print("The following number of events per dataset have been processed:")
    datasets = sorted(num_evts_per_dataset.keys())
    # stringify each count once; used for both the column width and the output
    counts = [str(num_evts_per_dataset[dataset]) for dataset in datasets]
    # default=0 covers an empty dict, for which no row is printed at all
    max_name = max((len(dataset) for dataset in datasets), default=0)
    max_num = max((len(count) for count in counts), default=0)
    expr_name = " {0: <"+str(max_name)+"}"
    expr_num = " {0: >"+str(max_num)+"}"
    for dataset, count in zip(datasets, counts):
        print(expr_name.format(dataset)+" : "+expr_num.format(count))
if __name__ == '__main__':
    # main subroutine: parse the mps.db file given on the command line and
    # print the number of events per dataset and per merged dataset

    if len(sys.argv) < 2:
        # usage errors go to stderr so they do not mix with the regular report
        print('ERROR', file=sys.stderr)
        print('usage:', file=sys.stderr)
        print(' python mps_list_evts.py <mps.db file name> or, after scram b', file=sys.stderr)
        print(' mps_list_evts.py <mps.db file name>', file=sys.stderr)
        sys.exit(1)

    # module-level assignment: `get_mille_lines' reads this file name
    mps_db = sys.argv[1]
    print('Parsing '+mps_db)

    mille_lines = get_mille_lines()
    num_evts_per_dataset = get_num_evts_per_dataset(mille_lines)
    merged_datasets = merge_datasets(num_evts_per_dataset)
    num_evts_per_merged_dataset = get_num_evts_per_merged_dataset(merged_datasets,num_evts_per_dataset)

    print("\n")
    print_num_evts_per_dataset(num_evts_per_dataset)
    print("\n\n")
    print_merging_scheme(merged_datasets)
    print("\n\n")
    print_num_evts_per_dataset(num_evts_per_merged_dataset)
|