1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
|
#!/bin/env python3
import ROOT
ROOT.PyConfig.IgnoreCommandLineOptions = True
import os
import sys
import argparse
import numpy as np
from DQMServices.FileIO.blacklist import get_blacklist
import multiprocessing
def create_dif(base_file_path, pr_file_path, pr_number, test_number, cmssw_version, num_processes, output_dir_path):
    """Compare a baseline and a PR DQM ROOT file and write out what differs.

    Both files are flattened into ``{path tuple: object}`` dictionaries.
    Objects found under the same path in both files are compared and, when
    they differ, written to both output files.  Objects present in only one
    input are written to the matching output file.  Finally the two output
    file names and the changed/removed/added counts are printed to stdout.

    Args:
        base_file_path: Path of the baseline ROOT file.
        pr_file_path: Path of the PR ROOT file.  The run number and both
            output file names are derived from this path.
        pr_number: PR number embedded in the output file names.
        test_number: Test number embedded in the output file names.
        cmssw_version: Release string embedded in the output file names.
        num_processes: When greater than 1 the shared paths are compared in
            worker processes; otherwise the comparison runs in this process.
        output_dir_path: Directory that receives the ``base`` and ``pr``
            sub-directories holding the output files.

    Returns:
        None.  Returns early, after reporting on stderr, when either input
        file cannot be opened.

    Raises:
        RuntimeError: If a comparison worker exits without reporting a result.
    """
    # NOTE(review): each TFile is detached from gROOT's list of files right
    # after opening; presumably to keep ROOT from managing/cleaning them up
    # itself -- confirm the exact motivation.
    base_file = ROOT.TFile(base_file_path, 'read')
    ROOT.gROOT.GetListOfFiles().Remove(base_file)
    pr_file = ROOT.TFile(pr_file_path, 'read')
    ROOT.gROOT.GetListOfFiles().Remove(pr_file)

    try:
        if base_file.IsOpen():
            print('Baseline file successfully opened', file=sys.stderr)
        else:
            print('Unable to open base file', file=sys.stderr)
            return

        if pr_file.IsOpen():
            print('PR file successfully opened', file=sys.stderr)
        else:
            print('Unable to open PR file', file=sys.stderr)
            return

        run_nr = get_run_nr(pr_file_path)

        # Map every non-blacklisted object to its path (tuple of names)
        base_flat_dict = flatten_file(base_file, run_nr)
        pr_flat_dict = flatten_file(pr_file, run_nr)

        # Sorting makes the processing and output order reproducible instead
        # of depending on set iteration order.
        # Paths that appear in both baseline and PR data. (Intersection)
        shared_paths = sorted(set(pr_flat_dict).intersection(base_flat_dict))
        # Paths that appear only in PR data. (Except)
        only_pr_paths = sorted(set(pr_flat_dict).difference(base_flat_dict))
        # Paths that appear only in baseline data. (Except)
        only_base_paths = sorted(set(base_flat_dict).difference(pr_flat_dict))

        # Histograms pointed to by these paths will be written to baseline output
        paths_to_save_in_base = []
        # Histograms pointed to by these paths will be written to pr output
        paths_to_save_in_pr = []

        # Make comparison
        if num_processes > 1:
            print("starting comparison using %d process(es)" % num_processes)
            manager = multiprocessing.Manager()
            return_dict = manager.dict()

            # num_processes equally sized slices plus one slice holding the
            # remainder of the integer division.
            block = len(shared_paths) // num_processes
            slices = [shared_paths[i * block:(i + 1) * block] for i in range(num_processes)]
            slices.append(shared_paths[num_processes * block:])

            workers = []
            for worker_id, chunk in enumerate(slices):
                worker = multiprocessing.Process(
                    target=compareMP,
                    args=(chunk, pr_flat_dict, base_flat_dict, worker_id, return_dict))
                workers.append(worker)
                worker.start()

            # Wait for every worker before inspecting results so that none is
            # left running if one of them failed.
            for worker in workers:
                worker.join()

            for worker_id, worker in enumerate(workers):
                outcome = return_dict.get(worker_id)
                if outcome is None:
                    # The worker died before storing its result.
                    raise RuntimeError(
                        'Comparison worker %d finished without a result (exit code %s)'
                        % (worker_id, worker.exitcode))
                paths_to_save_in_pr.extend(outcome['pr'])
                paths_to_save_in_base.extend(outcome['base'])

            paths_to_save_in_pr.sort()
            paths_to_save_in_base.sort()
            print("Done")
        else:
            compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base)

        # NOTE(review): the `== None` tests below are kept as equality checks
        # (not `is None`); presumably they are also meant to match PyROOT
        # null-object proxies -- confirm before changing them.

        # Collect paths that have to be written to baseline output file
        for path in only_base_paths:
            if base_flat_dict[path] == None:
                continue
            paths_to_save_in_base.append(path)

        # Collect paths that have to be written to PR output file
        for path in only_pr_paths:
            if pr_flat_dict[path] == None:
                continue
            paths_to_save_in_pr.append(path)

        base_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, False)
        pr_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, True)

        # Write baseline output
        save_paths(base_flat_dict, paths_to_save_in_base, os.path.join(output_dir_path, 'base', base_output_filename))

        # Write PR output
        save_paths(pr_flat_dict, paths_to_save_in_pr, os.path.join(output_dir_path, 'pr', pr_output_filename))

        # Info about changed, added and removed elements
        nr_of_changed_elements = len(set(paths_to_save_in_base).intersection(paths_to_save_in_pr))
        nr_of_removed_elements = len(paths_to_save_in_base) - nr_of_changed_elements
        nr_of_added_elements = len(paths_to_save_in_pr) - nr_of_changed_elements

        print('Base output file. PR output file. Changed elements, removed elements, added elements:')
        print(base_output_filename)
        print(pr_output_filename)
        print('%s %s %s' % (nr_of_changed_elements, nr_of_removed_elements, nr_of_added_elements))
    finally:
        # Release the inputs on every exit path, including the early returns
        # above and any exception raised while comparing or writing.
        for input_file in (pr_file, base_file):
            if input_file.IsOpen():
                input_file.Close()
def compareMP(shared_paths, pr_flat_dict, base_flat_dict, iProc, return_dict):
    """Compare one slice of shared paths and publish the result.

    Intended as the target of a worker process.  The comparison itself is
    delegated to ``compare`` so that the single-process and multi-process
    code paths always apply exactly the same rules.

    Args:
        shared_paths: Paths (keys of both dictionaries) to compare.
        pr_flat_dict: Mapping from path to the object read from the PR file.
        base_flat_dict: Mapping from path to the object read from the
            baseline file.
        iProc: Key under which this call stores its result.
        return_dict: Mapping that receives
            ``{'pr': [...], 'base': [...]}`` under ``iProc``; both lists hold
            the paths whose objects differ.
    """
    # Prepare output dictionary
    comparisons = {'pr': [], 'base': []}

    # compare() appends every differing path to both lists it is given
    compare(shared_paths, pr_flat_dict, base_flat_dict, comparisons['pr'], comparisons['base'])

    return_dict[iProc] = comparisons
def compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base):
    """Append every path whose PR and baseline objects differ to both lists.

    Args:
        shared_paths: Paths present as keys in both dictionaries.
        pr_flat_dict: Mapping from path to the object read from the PR file.
        base_flat_dict: Mapping from path to the object read from the
            baseline file.
        paths_to_save_in_pr: List extended in place with differing paths.
        paths_to_save_in_base: List extended in place with the same paths.
    """
    def both_inherit(first, second, class_name):
        # True when both objects report inheriting from class_name
        return first.InheritsFrom(class_name) and second.InheritsFrom(class_name)

    for path in shared_paths:
        pr_obj = pr_flat_dict[path]
        base_obj = base_flat_dict[path]

        # Pairs with a missing object on either side are not compared
        if pr_obj == None or base_obj == None:
            continue

        if both_inherit(pr_obj, base_obj, 'TProfile2D') or both_inherit(pr_obj, base_obj, 'TProfile'):
            # Profiles: content, entries and errors are compared per bin
            differs = not compare_TProfile(pr_obj, base_obj)
        elif both_inherit(pr_obj, base_obj, 'TH1'):
            # Histograms: compare the bin arrays; NaNs at the same position
            # count as equal
            pr_bins = np.array(pr_obj)
            base_bins = np.array(base_obj)
            differs = pr_bins.shape != base_bins.shape or not np.allclose(pr_bins, base_bins, equal_nan=True)
        else:
            # Anything else: rely on the objects' own inequality operator
            differs = pr_obj != base_obj

        if differs:
            paths_to_save_in_pr.append(path)
            paths_to_save_in_base.append(path)
def compare_TProfile(pr_item, base_item):
    """Tell whether two profile objects agree in every bin.

    For each index below ``GetSize()`` the bin content, the bin entries and
    the bin error of both objects are compared with ``np.isclose``; NaNs on
    both sides count as equal.

    Returns:
        False if the sizes differ or any compared value differs, True
        otherwise.
    """
    bin_count = pr_item.GetSize()
    if bin_count != base_item.GetSize():
        return False

    for index in range(bin_count):
        value_pairs = (
            (pr_item.GetBinContent(index), base_item.GetBinContent(index)),
            (pr_item.GetBinEntries(index), base_item.GetBinEntries(index)),
            (pr_item.GetBinError(index), base_item.GetBinError(index)),
        )
        if not all(np.isclose(pr_value, base_value, equal_nan=True) for pr_value, base_value in value_pairs):
            return False

    return True
def flatten_file(file, run_nr):
    """Flatten the contents of an open ROOT file into a dictionary.

    Every top-level key of ``file`` is read and walked with
    ``traverse_till_end``, which fills the result with
    ``{path tuple: object}`` entries.

    Reading stays best-effort: a top-level key that fails is reported on
    stderr and skipped (entries already collected from it are kept), so a
    single broken object does not abort the whole comparison.  Only
    ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.

    Args:
        file: Object providing ``GetListOfKeys()``; each key must provide
            ``ReadObj()`` and ``GetName()``.
        run_nr: Run number forwarded to ``traverse_till_end``.

    Returns:
        dict mapping path tuples to the objects found at those paths.
    """
    result = {}
    for key in file.GetListOfKeys():
        try:
            traverse_till_end(key.ReadObj(), [], result, run_nr)
        except Exception as error:
            print('Skipping top-level key %s: %s' % (key.GetName(), error), file=sys.stderr)
    return result
def traverse_till_end(node, dirs_list, result, run_nr):
    """Recursively record every non-blacklisted leaf object below ``node``.

    Args:
        node: Current object.  Anything exposing ``GetListOfKeys`` is
            descended into; everything else is treated as a leaf.
        dirs_list: Names of the ancestors of ``node``; not modified.
        result: Dictionary updated in place with ``{path tuple: leaf}``.
        run_nr: Run number forwarded to the blacklist check.
    """
    path_parts = dirs_list + [get_node_name(node)]

    if not hasattr(node, 'GetListOfKeys'):
        # Leaf: keep it unless its path is blacklisted
        if not is_blacklisted(path_parts, run_nr):
            result[tuple(path_parts)] = node
        return

    # Container: visit every child with the extended path
    for child_key in node.GetListOfKeys():
        traverse_till_end(child_key.ReadObj(), path_parts, result, run_nr)
def get_node_name(node):
    """Return the name under which ``node`` is stored in the flat dictionary.

    For objects inheriting from ``TObjString`` the name is expected to be a
    tag of the form ``<name>value</name>``; only ``name`` is kept and the
    string-monitor-element suffix is appended.  Any other object is
    identified by its plain ``GetName()``.
    """
    if not node.InheritsFrom('TObjString'):
        return node.GetName()

    # Text before the first '>' is '<name'; drop the leading '<'
    opening_tag = node.GetName().partition('>')[0]
    return opening_tag[1:] + get_string_suffix()
def get_string_suffix():
    """Return the suffix appended to the names of string monitor elements."""
    suffix = '_string_monitor_element'
    return suffix
def is_blacklisted(dirs_list, run_nr):
    """Tell whether the path described by ``dirs_list`` is blacklisted.

    The string-monitor-element suffix, when present at the end of the last
    path component, is removed before the lookup.  Only that trailing
    occurrence is removed, so a name that merely contains the suffix text
    elsewhere is left intact.

    Args:
        dirs_list: Non-empty list of path components; not modified.
        run_nr: Run number passed to ``get_blacklist``.

    Returns:
        True if the path tuple is contained in ``get_blacklist(run_nr)``.
    """
    suffix = get_string_suffix()
    leaf_name = dirs_list[-1]

    # Remove string suffix (trailing occurrence only)
    if suffix and leaf_name.endswith(suffix):
        leaf_name = leaf_name[:-len(suffix)]

    lookup_path = tuple(dirs_list[:-1]) + (leaf_name,)
    return lookup_path in get_blacklist(run_nr)
def save_paths(flat_dict, paths, result_file_path):
    """Write the objects selected by ``paths`` into a new ROOT file.

    Nothing is written, and no directory is created, when ``paths`` is
    empty.  Failure to open the output file is reported on stderr and the
    function returns without raising.

    Args:
        flat_dict: Mapping from path tuple to the object to write.
        paths: Paths (keys of ``flat_dict``) to write.
        result_file_path: Path of the ROOT file to (re)create.
    """
    if not paths:
        print('No differences were observed - output will not be written', file=sys.stderr)
        return

    # Make sure output dir exists.  An empty dirname means the current
    # directory, which needs no creation; exist_ok tolerates the directory
    # appearing between the check and the call.
    result_dir = os.path.dirname(result_file_path)
    if result_dir and not os.path.exists(result_dir):
        os.makedirs(result_dir, exist_ok=True)

    result_file = ROOT.TFile(result_file_path, 'recreate')
    ROOT.gROOT.GetListOfFiles().Remove(result_file)

    if not result_file.IsOpen():
        print('Unable to open %s output file' % result_file_path, file=sys.stderr)
        return

    try:
        for path in paths:
            save_to_file(flat_dict, path, result_file)
    finally:
        # Close the file even if writing one of the objects fails
        result_file.Close()

    print('Output written to %s file' % result_file_path, file=sys.stderr)
def save_to_file(flat_dict, path, output_file):
    """Write the object stored under ``path`` into ``output_file``.

    All components of ``path`` except the last name the nested directories
    the object is written into; missing directories are created on the way.

    Args:
        flat_dict: Mapping from path tuple to the object to write.
        path: Key of the object in ``flat_dict``.
        output_file: Open, writable file acting as the root directory.
    """
    obj = flat_dict[path]

    # Walk down from the file itself; the last component is the object's
    # own name and needs no directory
    target_dir = output_file
    for dir_name in path[:-1]:
        target_dir = create_dir(target_dir, dir_name)

    target_dir.cd()
    obj.Write()
def create_dir(parent_dir, name):
    """Return the entry called ``name`` in ``parent_dir``, creating it if absent.

    Args:
        parent_dir: Object providing ``Get(name)`` and ``mkdir(name)``.
        name: Name of the sub-directory.

    Returns:
        What ``parent_dir.Get(name)`` returned when that is truthy,
        otherwise the result of ``parent_dir.mkdir(name)``.
    """
    # Local is not called `dir`, so the builtin of that name is not shadowed
    sub_dir = parent_dir.Get(name)
    if not sub_dir:
        sub_dir = parent_dir.mkdir(name)
    return sub_dir
def get_output_filename(input_file_path, pr_number, test_number, cmssw_version, isPr):
    """Build the name of a comparison output file.

    The run field is the third underscore-separated token of the input file
    name.  The workflow is taken from the name of the directory containing
    the input file (text before the first underscore, dots replaced by
    underscores) and falls back to ``Unknown`` when that is empty.  The
    version field of the result is always ``V0001``.

    Example of a produced name:
        DQM_V0001_R000320822__wf136_892_pr__CMSSW_10_4_0_pre3-PR25518-1234__DQMIO.root
    When the run field is ``R000000001`` the RelVal naming pattern is used,
    i.e. ``RelVal_`` is inserted in front of ``wf``.

    Args:
        input_file_path: Path of the input ROOT file.
        pr_number: PR number placed after ``-PR``.
        test_number: Test number placed after the PR number.
        cmssw_version: Release string placed in front of ``-PR``.
        isPr: Selects the ``pr`` (truthy) or ``base`` (falsy) tag.
    """
    run = os.path.basename(input_file_path).split('_')[2]

    parent_dir_name = os.path.basename(os.path.dirname(input_file_path))
    workflow = parent_dir_name.split('_')[0].replace('.', '_') or 'Unknown'

    relval_prefix = 'RelVal_' if run == 'R000000001' else ''
    flavour = 'pr' if isPr else 'base'

    fields = (run, relval_prefix, workflow, flavour, cmssw_version, pr_number, test_number)
    return 'DQM_V0001_%s__%swf%s_%s__%s-PR%s-%s__DQMIO.root' % fields
def get_run_nr(file_path):
    """Extract the run number, as a string, from a DQM file path.

    The third underscore-separated token of the file name (for example
    ``R000320822``) is taken and its leading ``R`` and ``0`` characters are
    stripped, giving ``'320822'``.
    """
    file_name = os.path.basename(file_path)
    run_field = file_name.split('_')[2]
    without_marker = run_field.lstrip('R')
    return without_marker.lstrip('0')
if __name__ == '__main__':
    # Command line entry point: parse the options and run the comparison.
    parser = argparse.ArgumentParser(description="This tool compares DQM monitor elements found in base-file with the ones found in pr-file. "
                                                 "Comparison is done bin by bin and output is written to a root file containing only the changes.")
    parser.add_argument('-b', '--base-file', help='Baseline IB DQM root file', required=True)
    parser.add_argument('-p', '--pr-file', help='PR DQM root file', required=True)
    parser.add_argument('-n', '--pr-number', help='PR number under test', default='00001')
    parser.add_argument('-t', '--test-number', help='Unique test number to distinguish different comparisons of the same PR.', default='1')
    # Read the environment without failing: when CMSSW_VERSION is not set the
    # option simply becomes mandatory, instead of a KeyError being raised
    # while the parser is built (even if -r was given).
    default_release = os.environ.get('CMSSW_VERSION')
    parser.add_argument('-r', '--release-format', help='Release format in this format: CMSSW_10_5_X_2019-02-17-0000',
                        default=default_release, required=default_release is None)
    parser.add_argument('-j', '--num-processes', help='Number of processes forked to parallel process the comparison', default=1, type=int)
    parser.add_argument('-o', '--output-dir', help='Comparison root files output directory', default='dqmHistoComparisonOutput')
    args = parser.parse_args()

    # Keep only the first four underscore-separated fields of the release
    # string, e.g. CMSSW_10_5_X
    cmssw_version = '_'.join(args.release_format.split('_')[:4])

    create_dif(args.base_file, args.pr_file, args.pr_number, args.test_number, cmssw_version, args.num_processes, args.output_dir)
|