FileIO/scripts/compareHistograms.py

0001 #!/bin/env python3
0002
0003 import ROOT
0004 ROOT.PyConfig.IgnoreCommandLineOptions = True
0005 import os
0006 import sys
0007 import argparse
0008 import numpy as np
0009 from DQMServices.FileIO.blacklist import get_blacklist
0010 import multiprocessing
0011
def create_dif(base_file_path, pr_file_path, pr_number, test_number, cmssw_version, num_processes, output_dir_path):
   """Compare two DQM ROOT files and write out the elements that differ.

   Both files are flattened into ``{path tuple: object}`` dictionaries.
   Elements found in both files are compared and, when different, scheduled
   for both output files; elements found in only one file are scheduled for
   that side's output file. Finally the output file names and the numbers of
   changed, removed and added elements are printed to stdout.

   Args:
      base_file_path: Path of the baseline ROOT file.
      pr_file_path: Path of the PR ROOT file. The run number and the output
         file names are derived from this path.
      pr_number: PR number embedded in the output file names.
      test_number: Test number embedded in the output file names.
      cmssw_version: Release string embedded in the output file names.
      num_processes: Upper bound on the number of worker processes. Values
         of 1 or less run the comparison inside this process.
      output_dir_path: Directory receiving the ``base`` and ``pr``
         sub-directories with the output files.

   Returns:
      None. Returns early (after a message on stderr) when either input
      file cannot be opened.

   Raises:
      RuntimeError: If a worker process ends without delivering its result.
   """
   base_file = ROOT.TFile(base_file_path, 'read')
   ROOT.gROOT.GetListOfFiles().Remove(base_file)

   pr_file = ROOT.TFile(pr_file_path, 'read')
   ROOT.gROOT.GetListOfFiles().Remove(pr_file)

   if base_file.IsOpen():
      print('Baseline file successfully opened', file=sys.stderr)
   else:
      print('Unable to open base file', file=sys.stderr)
      # Do not leave the other input open when bailing out.
      if pr_file.IsOpen():
         pr_file.Close()
      return

   if pr_file.IsOpen():
      print('PR file successfully opened', file=sys.stderr)
   else:
      print('Unable to open PR file', file=sys.stderr)
      base_file.Close()
      return

   run_nr = get_run_nr(pr_file_path)

   # Map every path (tuple of directory names + element name) to its object
   base_flat_dict = flatten_file(base_file, run_nr)
   pr_flat_dict = flatten_file(pr_file, run_nr)

   # Paths that appear in both baseline and PR data
   shared_paths = list(set(pr_flat_dict).intersection(base_flat_dict))

   # Paths that appear only in PR data
   only_pr_paths = list(set(pr_flat_dict).difference(base_flat_dict))

   # Paths that appear only in baseline data
   only_base_paths = list(set(base_flat_dict).difference(pr_flat_dict))

   # Histograms pointed to by these paths will be written to baseline output
   paths_to_save_in_base = []

   # Histograms pointed to by these paths will be written to pr output
   paths_to_save_in_pr = []

   # Make comparison
   if num_processes > 1:
      print("starting comparison using %d process(es)" % num_processes)
      manager = multiprocessing.Manager()
      return_dict = manager.dict()

      # Ceiling division gives at most num_processes chunks that together
      # cover every shared path exactly once (no extra "remainder" worker).
      chunk_size = max(1, -(-len(shared_paths) // num_processes))
      chunks = [shared_paths[start:start + chunk_size]
                for start in range(0, len(shared_paths), chunk_size)]

      workers = []
      for index, chunk in enumerate(chunks):
         worker = multiprocessing.Process(target=compareMP, args=(chunk, pr_flat_dict, base_flat_dict, index, return_dict))
         workers.append(worker)
         worker.start()

      for index, worker in enumerate(workers):
         worker.join()
         if index not in return_dict:
            # A worker that died would otherwise make differences vanish silently.
            raise RuntimeError('Comparison worker %d finished without a result (exit code %s)' % (index, worker.exitcode))
         chunk_result = return_dict[index]
         paths_to_save_in_pr.extend(chunk_result['pr'])
         paths_to_save_in_base.extend(chunk_result['base'])

      paths_to_save_in_pr.sort()
      paths_to_save_in_base.sort()
      print("Done")
   else:
      compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base)

   # NOTE(review): `== None` is kept on purpose below. The values are ROOT
   # objects which may define their own equality (e.g. for null pointers),
   # so `is None` might not be equivalent - confirm before changing.

   # Collect paths that have to be written to baseline output file
   for path in only_base_paths:
      if base_flat_dict[path] == None:
         continue
      paths_to_save_in_base.append(path)

   # Collect paths that have to be written to PR output file
   for path in only_pr_paths:
      if pr_flat_dict[path] == None:
         continue
      paths_to_save_in_pr.append(path)

   base_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, False)
   pr_output_filename = get_output_filename(pr_file_path, pr_number, test_number, cmssw_version, True)

   # Write baseline output
   save_paths(base_flat_dict, paths_to_save_in_base, os.path.join(output_dir_path, 'base', base_output_filename))

   # Write PR output
   save_paths(pr_flat_dict, paths_to_save_in_pr, os.path.join(output_dir_path, 'pr', pr_output_filename))

   pr_file.Close()
   base_file.Close()

   # Changed elements are scheduled on both sides; whatever is left on one
   # side only was removed (baseline) or added (PR).
   nr_of_changed_elements = len(set(paths_to_save_in_base).intersection(paths_to_save_in_pr))
   nr_of_removed_elements = len(paths_to_save_in_base) - nr_of_changed_elements
   nr_of_added_elements = len(paths_to_save_in_pr) - nr_of_changed_elements

   print('Base output file. PR output file. Changed elements, removed elements, added elements:')
   print(base_output_filename)
   print(pr_output_filename)
   print('%s %s %s' % (nr_of_changed_elements, nr_of_removed_elements, nr_of_added_elements))
0121
def compareMP(shared_paths, pr_flat_dict, base_flat_dict, iProc, return_dict):
   """Compare one chunk of shared paths and publish the result under *iProc*.

   This is the multiprocessing counterpart of ``compare``: it runs the very
   same comparison and stores the outcome in *return_dict* instead of
   filling caller-owned lists.

   Args:
      shared_paths: Paths (keys of both dictionaries) this worker handles.
      pr_flat_dict: Mapping of path to object read from the PR file.
      base_flat_dict: Mapping of path to object read from the baseline file.
      iProc: Key under which the result is stored in *return_dict*.
      return_dict: Mapping receiving ``{'pr': [...], 'base': [...]}`` with
         the paths whose objects differ.
   """
   comparisons = {'pr': [], 'base': []}

   # Delegate to the single-process implementation so that both code paths
   # always apply identical comparison rules.
   compare(shared_paths, pr_flat_dict, base_flat_dict, comparisons['pr'], comparisons['base'])

   return_dict[iProc] = comparisons
0160
def compare(shared_paths, pr_flat_dict, base_flat_dict, paths_to_save_in_pr, paths_to_save_in_base):
   """Append every shared path whose PR and baseline objects differ.

   How two objects are compared depends on what both of them inherit from:
   ``TProfile2D`` or ``TProfile`` pairs go through ``compare_TProfile``,
   ``TH1`` pairs are converted to numpy arrays and compared for shape and
   closeness (NaNs compare equal), anything else is compared with ``!=``.

   Args:
      shared_paths: Paths present in both dictionaries.
      pr_flat_dict: Mapping of path to object read from the PR file.
      base_flat_dict: Mapping of path to object read from the baseline file.
      paths_to_save_in_pr: List extended in place with differing paths.
      paths_to_save_in_base: List extended in place with the same paths.
   """
   for path in shared_paths:
      pr_obj = pr_flat_dict[path]
      base_obj = base_flat_dict[path]

      # NOTE(review): `== None` kept as in the original; the objects may
      # define their own equality, so `is None` might behave differently.
      if pr_obj == None or base_obj == None:
         continue

      both_profiles = (
         (pr_obj.InheritsFrom('TProfile2D') and base_obj.InheritsFrom('TProfile2D'))
         or (pr_obj.InheritsFrom('TProfile') and base_obj.InheritsFrom('TProfile'))
      )

      if both_profiles:
         # Profiles: content, entries and errors all have to agree
         changed = not compare_TProfile(pr_obj, base_obj)
      elif pr_obj.InheritsFrom('TH1') and base_obj.InheritsFrom('TH1'):
         # Plain histograms: compare the array representation
         pr_bins = np.array(pr_obj)
         base_bins = np.array(base_obj)
         changed = pr_bins.shape != base_bins.shape or not np.allclose(pr_bins, base_bins, equal_nan=True)
      else:
         # Non-histogram elements
         changed = pr_obj != base_obj

      if changed:
         paths_to_save_in_pr.append(path)
         paths_to_save_in_base.append(path)
0195
def compare_TProfile(pr_item, base_item):
   """Return True when both profiles agree cell by cell, False otherwise.

   The profiles differ when their ``GetSize()`` values differ or when, for
   any cell index below that size, bin content, bin entries or bin error are
   not close according to numpy's default tolerances (NaN equals NaN).
   """
   n_cells = pr_item.GetSize()
   if n_cells != base_item.GetSize():
      return False

   for cell in range(n_cells):
      pr_values = (pr_item.GetBinContent(cell), pr_item.GetBinEntries(cell), pr_item.GetBinError(cell))
      base_values = (base_item.GetBinContent(cell), base_item.GetBinEntries(cell), base_item.GetBinError(cell))

      # One vectorised check per cell covers content, entries and error
      if not np.allclose(pr_values, base_values, equal_nan=True):
         return False

   return True
0221
0222 def flatten_file(file, run_nr):
0223    result = {}
0224    for key in file.GetListOfKeys():
0225       try:
0226          traverse_till_end(key.ReadObj(), [], result, run_nr)
0227       except:
0228          pass
0229
0230    return result
0231
def traverse_till_end(node, dirs_list, result, run_nr):
   """Recursively record every non-blacklisted leaf below *node* in *result*.

   A node exposing ``GetListOfKeys`` is descended into; any other node is a
   leaf and is stored under the tuple of names leading to it, unless that
   path is blacklisted for *run_nr*.

   Args:
      node: Object to visit.
      dirs_list: Names of the ancestors of *node* (not modified).
      result: Dictionary filled in place with ``path tuple -> node``.
      run_nr: Run number used for the blacklist lookup.
   """
   path_so_far = dirs_list + [get_node_name(node)]

   if not hasattr(node, 'GetListOfKeys'):
      # Leaf element
      if not is_blacklisted(path_so_far, run_nr):
         result[tuple(path_so_far)] = node
      return

   for child_key in node.GetListOfKeys():
      traverse_till_end(child_key.ReadObj(), path_so_far, result, run_nr)
0241
def get_node_name(node):
   """Return the name under which *node* is stored in the flattened paths.

   For nodes inheriting from ``TObjString`` the name is taken from the tag
   (``<name>value</name>`` gives ``name``) and the string-element suffix is
   appended; all other nodes are stored under ``GetName()`` unchanged.
   """
   if not node.InheritsFrom('TObjString'):
      return node.GetName()

   # Everything before the first '>' is '<name'; drop the leading '<'
   opening_tag = node.GetName().partition('>')[0]
   return opening_tag[1:] + get_string_suffix()
0249
def get_string_suffix():
   """Return the suffix that marks names of string monitor elements."""
   suffix = '_string_monitor_element'
   return suffix
0252
def is_blacklisted(dirs_list, run_nr):
   """Tell whether the path *dirs_list* is blacklisted for run *run_nr*.

   The string-element suffix, when present at the end of the last path
   component, is removed before the lookup. Only that trailing suffix is
   removed; the same text appearing earlier in the name is left untouched.

   Args:
      dirs_list: List of path components; not modified.
      run_nr: Run number passed to ``get_blacklist``.

   Returns:
      True if the resulting tuple is contained in ``get_blacklist(run_nr)``.
   """
   suffix = get_string_suffix()

   leaf_name = dirs_list[-1]
   if leaf_name.endswith(suffix):
      # Slice instead of str.replace(): replace() would also delete
      # occurrences of the suffix in the middle of the name.
      leaf_name = leaf_name[:-len(suffix)]

   lookup_path = tuple(dirs_list[:-1]) + (leaf_name,)
   return lookup_path in get_blacklist(run_nr)
0261
def save_paths(flat_dict, paths, result_file_path):
   """Write the objects selected by *paths* into a new ROOT file.

   Nothing is written when *paths* is empty. Otherwise the directory of
   *result_file_path* is created if needed, the file is (re)created and each
   selected object is saved into it. Progress and errors go to stderr.

   Args:
      flat_dict: Mapping of path tuple to object.
      paths: Paths (keys of *flat_dict*) to write.
      result_file_path: Location of the ROOT file to create.
   """
   if not paths:
      print('No differences were observed - output will not be written', file=sys.stderr)
      return

   # Make sure output dir exists. A bare file name has no directory part,
   # and os.makedirs('') would raise, so only create when there is one.
   result_dir = os.path.dirname(result_file_path)
   if result_dir:
      os.makedirs(result_dir, exist_ok=True)

   result_file = ROOT.TFile(result_file_path, 'recreate')
   ROOT.gROOT.GetListOfFiles().Remove(result_file)

   if not result_file.IsOpen():
      print('Unable to open %s output file' % result_file_path, file=sys.stderr)
      return

   for path in paths:
      save_to_file(flat_dict, path, result_file)

   result_file.Close()
   print('Output written to %s file' % result_file_path, file=sys.stderr)
0284
def save_to_file(flat_dict, path, output_file):
   """Write the object stored under *path* into the matching directory.

   The directories named by all but the last component of *path* are looked
   up (or created) below *output_file*, then the object is written there.

   Args:
      flat_dict: Mapping of path tuple to object.
      path: Tuple of directory names followed by the element name.
      output_file: ROOT file (or directory) opened for writing.
   """
   histogram = flat_dict[path]

   current = output_file

   # Last item is the element name. No need to create a dir for it
   for directory in path[:-1]:
      current = create_dir(current, directory)

   # Write() stores into ROOT's current directory, so select the target
   # explicitly. This also covers paths without any directory component,
   # which previously ended up wherever the preceding call had left off.
   current.cd()
   histogram.Write()
0295
0296    histogram.Write()
0297
def create_dir(parent_dir, name):
   """Return sub-directory *name* of *parent_dir*, creating it if missing."""
   existing = parent_dir.Get(name)
   return existing if existing else parent_dir.mkdir(name)
0304
def get_output_filename(input_file_path, pr_number, test_number, cmssw_version, isPr):
   """Build the name of a comparison output file.

   Sample of the produced format:
   DQM_V0001_R000320822__wf136_892_pr__CMSSW_10_4_0_pre3-PR25518-1234__DQMIO.root
   For run ``R000000001`` the workflow part is prefixed with ``RelVal_``:
   DQM_V0001_R000000001__RelVal_wf136_892_pr__CMSSW_10_4_0_pre3-PR25518-1234__DQMIO.root

   Args:
      input_file_path: Path of the input file. The run is the third
         ``_``-separated field of its base name; the workflow is the part of
         the parent directory name before the first ``_`` (dots replaced by
         underscores, ``Unknown`` when empty).
      pr_number: PR number placed after ``-PR``.
      test_number: Test number placed after the PR number.
      cmssw_version: Release string placed before ``-PR``.
      isPr: Selects the ``pr`` (true) or ``base`` (false) marker.

   Returns:
      The output file name (without any directory).
   """
   run = os.path.basename(input_file_path).split('_')[2]

   parent_dir_name = os.path.basename(os.path.dirname(input_file_path))
   workflow = parent_dir_name.split('_')[0].replace('.', '_') or 'Unknown'

   relval_prefix = 'RelVal_' if run == 'R000000001' else ''
   baseOrPr = 'pr' if isPr else 'base'

   name_fields = (run, relval_prefix, workflow, baseOrPr, cmssw_version, pr_number, test_number)
   return 'DQM_V0001_%s__%swf%s_%s__%s-PR%s-%s__DQMIO.root' % name_fields
0327
def get_run_nr(file_path):
   """Return the run number encoded in the file name, as a string.

   The third ``_``-separated field of the base name is taken and its
   leading ``R`` characters, then its leading zeros, are stripped.
   """
   run_field = os.path.basename(file_path).split('_')[2]
   without_marker = run_field.lstrip('R')
   return without_marker.lstrip('0')
0330
if __name__ == '__main__':
   # Command line entry point: parse the options and run the comparison.
   parser = argparse.ArgumentParser(description="This tool compares DQM monitor elements found in base-file with the ones found in pr-file. "
      "Comparison is done bin by bin and output is written to a root file containing only the changes.")
   parser.add_argument('-b', '--base-file', help='Baseline IB DQM root file', required=True)
   parser.add_argument('-p', '--pr-file', help='PR DQM root file', required=True)
   parser.add_argument('-n', '--pr-number', help='PR number under test', default='00001')
   parser.add_argument('-t', '--test-number', help='Unique test number to distinguish different comparisons of the same PR.', default='1')
   # .get() instead of indexing: a missing CMSSW_VERSION must not break
   # --help or runs that pass -r explicitly.
   parser.add_argument('-r', '--release-format', help='Release format in this format: CMSSW_10_5_X_2019-02-17-0000', default=os.environ.get('CMSSW_VERSION'))
   parser.add_argument('-j', '--num-processes', help='Number of processes forked to parallel process the comparison', default=1, type=int)
   parser.add_argument('-o', '--output-dir', help='Comparison root files output directory', default='dqmHistoComparisonOutput')
   args = parser.parse_args()

   if args.release_format is None:
      parser.error('the release is unknown: pass -r/--release-format or set the CMSSW_VERSION environment variable')

   # Keep only the first four '_'-separated fields (e.g. CMSSW_10_5_X)
   cmssw_version = '_'.join(args.release_format.split('_')[:4])

   create_dif(args.base_file, args.pr_file, args.pr_number, args.test_number, cmssw_version, args.num_processes, args.output_dir)
0344
0345    create_dif(args.base_file, args.pr_file, args.pr_number, args.test_number, cmssw_version, args.num_processes, args.output_dir)