Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-03-05 02:39:16

0001 #!/usr/bin/env python3
0002 
0003 from __future__ import print_function
0004 import logging
0005 import argparse
0006 import math
0007 import glob
0008 import sys
0009 import os
0010 import re
0011 
0012 class BaseLHEMerger(object):
0013     """Base class of the LHE merge schemes"""
0014 
0015     def __init__(self, input_files, output_file):
0016         self.input_files = input_files
0017         self.output_file = output_file
0018 
0019     def merge(self):
0020         """Output the merged LHE"""
0021         pass
0022 
0023 class DefaultLHEMerger(BaseLHEMerger):
0024     """Default LHE merge scheme that copies the header of the first LHE file,
0025     merges and outputs the init block, then concatenates all event blocks."""
0026 
0027     def __init__(self, input_files, output_file, **kwargs):
0028         super(DefaultLHEMerger, self).__init__(input_files, output_file)
0029 
0030         self.bypass_check = kwargs.get('bypass_check', False)
0031         # line-by-line iterator for each input file
0032         self._f = [self.file_iterator(name) for name in self.input_files]
0033         self._header_str = []
0034         self._is_mglo = False
0035         self._xsec_combined = 0.
0036         self._uwgt = 0.
0037         self._init_str = [] # initiated blocks for each input file
0038         self._nevent = [] # number of events for each input file
0039 
0040     def file_iterator(self, path):
0041         """Line-by-line iterator of a txt file"""
0042         with open(path, 'r') as f:
0043             for line in f:
0044                 yield line
0045 
0046     def check_header_compatibility(self):
0047         """Check if all headers for input files are consistent."""
0048 
0049         if self.bypass_check:
0050             return
0051 
0052         inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
0053                                   "Use -b/--bypass-check to bypass the check.")
0054         allow_diff_keys = [
0055             'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
0056             'Number of Events', 'Integrated weight'
0057             ]
0058         self._header_lines = [header.split('\n') for header in self._header_str]
0059 
0060         # Iterate over header lines for all input files and check consistency
0061         logging.debug('header line number: %s' \
0062             % ', '.join([str(len(lines)) for lines in self._header_lines]))
0063         assert all([
0064             len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
0065             ), inconsistent_error_info % "line number not matches"
0066         inconsistent_lines_set = [set() for _ in self._header_lines]
0067         for line_zip in zip(*self._header_lines):
0068             if any([k in line_zip[0] for k in allow_diff_keys]):
0069                 logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
0070                 continue
0071             if not all([line_zip[0] == line for line in line_zip]):
0072                 # Ok so meet inconsistency in some lines, then temporarily store them
0073                 for i, line in enumerate(line_zip):
0074                     inconsistent_lines_set[i].add(line)
0075         # Those inconsistent lines still match, meaning that it is only a change of order
0076         assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
0077             inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
0078 
0079     def merge_headers(self):
0080         """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
0081 
0082         self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
0083         if self._is_mglo and not self.bypass_check:
0084             # Special handling of MadGraph5 LO LHEs
0085             match_geninfo = [
0086                 re.search(
0087                     (r"\<MGGenerationInfo\>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
0088                     r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+\<\/MGGenerationInfo\>"),
0089                     header
0090                     ) for header in self._header_str
0091                 ]
0092             self._xsec_combined = sum(
0093                 [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
0094                 ) / sum(self._nevent)
0095             geninfo_combined = ("<MGGenerationInfo>\n"
0096                                 "#  Number of Events        : %d\n"
0097                                 "#  Integrated weight (pb)  : %.10f\n</MGGenerationInfo>") \
0098                                     % (sum(self._nevent), self._xsec_combined)
0099             logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
0100             for info, nevt in zip(match_geninfo, self._nevent):
0101                 logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
0102             logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
0103                 % (sum(self._nevent), self._xsec_combined))
0104 
0105             header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
0106             return header_combined
0107 
0108         else:
0109             # No need to merge the headers
0110             return self._header_str[0]
0111 
0112     def merge_init_blocks(self):
0113         """If all <init> blocks are identical, return the same <init> block
0114         (in the case of Powheg LHEs); otherwise, calculate the output <init>
0115         blocks by merging the input blocks info using formula (same with the
0116         MG5LOLHEMerger scheme):
0117             XSECUP = sum(xsecup * no.events) / tot.events
0118             XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
0119             XMAXUP = max(xmaxup)
0120         """
0121 
0122         if self.bypass_check:
0123             # If bypass the consistency check, simply use the first LHE <init>
0124             # block as the output
0125             return self._init_str[0]
0126 
0127         # Initiate collected init block info. Will be in format of
0128         # {iprocess: [xsecup, xerrup, xmaxup]}
0129         new_init_block = {}
0130         old_init_block = [{} for _ in self._init_str]
0131 
0132         # Read the xsecup, xerrup, and xmaxup info from the <init> block for
0133         # all input LHEs
0134         for i, bl in enumerate(self._init_str): # loop over files
0135             nline = int(bl.split('\n')[0].strip().split()[-1])
0136 
0137             # loop over lines in <init> block
0138             for bl_line in bl.split('\n')[1:nline + 1]:
0139                 bl_line_sp = bl_line.split()
0140                 old_init_block[i][int(bl_line_sp[3])] = [
0141                     float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
0142 
0143             # After reading all subprocesses info, store the rest content in
0144             # <init> block for the first file
0145             if i == 0:
0146                 info_after_subprocess = bl.strip().split('\n')[nline + 1:]
0147 
0148             logging.info('Input file: %s' % self.input_files[i])
0149             for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
0150                 # reverse order: follow the MG5 custom
0151                 logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0152                     % tuple(old_init_block[i][ipr] + [ipr]))
0153 
0154         # Adopt smarter <init> block merging method
0155         # If all <init> blocks from input files are identical, return the same block;
0156         # otherwise combine them based on MG5LOLHEMerger scheme
0157         if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
0158             # All <init> blocks are identical
0159             logging.info(
0160                 'All input <init> blocks are identical. Output the same "<init> block.')
0161             return self._init_str[0]
0162 
0163         # Otherwise, calculate merged init block
0164         for i in range(len(self._f)):
0165             for ipr in old_init_block[i]:
0166                 # Initiate the subprocess for the new block if it is found for the
0167                 # first time in one input file
0168                 if ipr not in new_init_block:
0169                     new_init_block[ipr] = [0., 0., 0.]
0170                 new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
0171                 new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
0172                 new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
0173         tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
0174 
0175         # Write first line of the <init> block (modify the nprocess at the last)
0176         self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
0177             + ' ' + str(len(new_init_block)) + '\n'
0178         # Form the merged init block
0179         logging.info('Output file: %s' % self.output_file)
0180         for ipr in sorted(list(new_init_block.keys()), reverse=True):
0181             # reverse order: follow the MG5 custom
0182             new_init_block[ipr][0] /= tot_nevent
0183             new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
0184             logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0185                 % tuple(new_init_block[ipr] + [ipr]))
0186             self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
0187         self._merged_init_str += '\n'.join(info_after_subprocess)
0188         if len(info_after_subprocess):
0189             self._merged_init_str += '\n'
0190 
0191         return self._merged_init_str
0192 
0193     def merge(self):
0194         with open(self.output_file, 'w') as fw:
0195             # Read the header for the all input files
0196             for i in range(len(self._f)):
0197                 header = []
0198                 line = next(self._f[i])
0199                 while not re.search('\s*<init(>|\s)', line):
0200                     header.append(line)
0201                     line = next(self._f[i])
0202                 # 'header' includes all contents before reaches <init>
0203                 self._header_str.append(''.join(header))
0204             self.check_header_compatibility()
0205 
0206             # Read <init> blocks for all input_files
0207             for i in range(len(self._f)):
0208                 init = []
0209                 line = next(self._f[i])
0210                 while not re.search('\s*</init>', line):
0211                     init.append(line)
0212                     line = next(self._f[i])
0213                 # 'init_str' includes all contents inside <init>...</init>
0214                 self._init_str.append(''.join(init))
0215 
0216             # Iterate over all events file-by-file and write events temporarily
0217             # to .tmp.lhe
0218             with open('.tmp.lhe', 'w') as _fwtmp:
0219                 for i in range(len(self._f)):
0220                     nevent = 0
0221                     while True:
0222                         line = next(self._f[i])
0223                         if re.search('\s*</event>', line):
0224                             nevent += 1
0225                         if re.search('\s*</LesHouchesEvents>', line):
0226                             break
0227                         _fwtmp.write(line)
0228                     self._nevent.append(nevent)
0229                     self._f[i].close()
0230 
0231             # Merge the header and init blocks and write to the output
0232             fw.write(self.merge_headers())
0233             fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
0234 
0235             # Write event blocks in .tmp.lhe back to the output
0236             # If is MG5 LO LHE, will recalculate the weights based on combined xsec
0237             # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
0238             if self._is_mglo and not self.bypass_check:
0239                 event_norm = re.search(
0240                     r'\s(\w+)\s*=\s*event_norm\s',
0241                     self._header_str[0]).group(1)
0242                 if event_norm == 'sum':
0243                     self._uwgt = self._xsec_combined / sum(self._nevent)
0244                 elif event_norm == 'average':
0245                     self._uwgt = self._xsec_combined
0246                 logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
0247                              "recalculate weights in each event block.\n"
0248                              "Unit weight: %+.7E") % (event_norm, self._uwgt))
0249 
0250                 # Modify event wgt when transfering .tmp.lhe to the output file
0251                 event_line = -999
0252                 with open('.tmp.lhe', 'r') as ftmp:
0253                     sign = lambda x: -1 if x < 0 else 1
0254                     for line in ftmp:
0255                         event_line += 1
0256                         if re.search('\s*<event.*>', line):
0257                             event_line = 0
0258                         if event_line == 1:
0259                             # modify the XWGTUP appeared in the first line of the
0260                             # <event> block
0261                             orig_wgt = float(line.split()[2])
0262                             fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
0263                                 % (sign(orig_wgt) * self._uwgt), line))
0264                         elif re.search('\s*<wgt.*>.*</wgt>', line):
0265                             addi_wgt_str = re.search(r'\<wgt.*\>\s*(\S+)\s*\<\/wgt\>', line).group(1)
0266                             fw.write(line.replace(
0267                                 addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
0268                         else:
0269                             fw.write(line)
0270             else:
0271                 # Simply transfer all lines
0272                 with open('.tmp.lhe', 'r') as ftmp:
0273                     for line in ftmp:
0274                         fw.write(line)
0275             fw.write('</LesHouchesEvents>\n')
0276             os.remove('.tmp.lhe')
0277 
0278 
0279 class MG5LOLHEMerger(BaseLHEMerger):
0280     """Use the merger script dedicated for MG5 LO LHEs, as introduced in
0281     https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
0282     """
0283 
0284     def __init__(self, input_files, output_file, **kwargs):
0285         super(MG5LOLHEMerger, self).__init__(input_files, output_file)
0286         self._merger_script_url = \
0287             'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
0288 
0289     def merge(self):
0290         logging.info(
0291             ('Use the merger script in genproductions dedicated for '
0292             'MadGraph5-produced LHEs'))
0293         os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
0294             % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
0295         os.system('gzip -df %s.gz' % self.output_file)
0296         os.system('rm banner.txt')
0297 
0298 
0299 class ExternalCppLHEMerger(BaseLHEMerger):
0300     """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
0301     https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
0302     """
0303 
0304     def __init__(self, input_files, output_file, **kwargs):
0305         super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
0306         self._merger_script_url = \
0307             'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
0308 
0309     def merge(self):
0310         logging.info(
0311             ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
0312         os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
0313         with open('mergeLheFiles.cpp') as f:
0314             script_str = f.read()
0315         with open('mergeLheFiles.cpp', 'w') as fw:
0316             fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
0317         with open('input_files.txt', 'w') as fw:
0318             fw.write('\n'.join(self.input_files) + '\n')
0319 
0320         os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
0321         os.system('./mergeLheFiles input_files.txt')
0322         os.system('rm mergeLheFiles* input_files.txt')
0323 
0324 
0325 def main(argv = None):
0326     """Main routine of the script.
0327 
0328     Arguments:
0329     - `argv`: arguments passed to the main routine
0330     """
0331 
0332     if argv == None:
0333         argv = sys.argv[1:]
0334 
0335     parser = argparse.ArgumentParser(
0336         description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
0337                     "natural way.\n"
0338                     "A detailed description of the merging step (in the default mode):\n"
0339                     "  1. Header:\n"
0340                     "    a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
0341                     "    b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
0342                     "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
0343                     "  2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
0344                     "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
0345                     "each subprocess.\n"
0346                     "  3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
0347                     "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
0348                     "run card.\n"
0349                     "For further development of this script please always validate the merging result on the test "
0350                     "routines: https://github.com/colizz/mergelhe_validate\n"
0351                     "Example usage:\n"
0352                     "    mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
0353         formatter_class=argparse.RawTextHelpFormatter)
0354     parser.add_argument("-i", "--input-files", type=str,
0355                         help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
0356     parser.add_argument("-o", "--output-file",
0357                         default='output.lhe', type=str,
0358                         help="Output LHE file path.")
0359     parser.add_argument("--force-mglo-merger", action='store_true',
0360                         help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
0361                              "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
0362     parser.add_argument("--force-cpp-merger", action='store_true',
0363                         help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
0364                              "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
0365     parser.add_argument("-b", "--bypass-check", action='store_true',
0366                         help=("Bypass the compatibility check for the headers. If true, the header and init block "
0367                              "will be just a duplicate from the first input file, and events are concatenated without "
0368                              "modification."))
0369     parser.add_argument("--debug", action='store_true',
0370                         help="Use the debug mode.")
0371     args = parser.parse_args(argv)
0372 
0373     logging.basicConfig(
0374         format='[%(levelname)s] %(message)s',
0375         level=logging.INFO if not args.debug else DEBUG)
0376     logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
0377 
0378     # Extract input LHE files from the path
0379     assert len(args.input_files), \
0380         ('Please specify your input LHE files by -i/--input-files. '
0381         'Run \'mergeLHE.py -h\' for details.')
0382     input_files = [] # each individual input file
0383     for path in args.input_files.split(','):
0384         find_files = glob.glob(path)
0385         if len(find_files) == 0:
0386             logging.info('Warning: cannot find files in %s' % path)
0387         input_files += find_files
0388     input_files.sort()
0389     logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
0390     logging.info('>>> Write to output: %s ' % args.output_file)
0391 
0392     if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
0393         os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
0394 
0395     # Check arguments
0396     assert len(input_files) > 0, 'Input LHE files should be more than 0.'
0397     if len(input_files) == 1:
0398         logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
0399         import shutil
0400         shutil.copy(input_files[0], args.output_file)
0401         return
0402     assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
0403         "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
0404 
0405     # Determine the merging scheme
0406     if args.force_mglo_merger:
0407         lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
0408     elif args.force_cpp_merger:
0409         lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
0410     else:
0411         lhe_merger = DefaultLHEMerger(input_files, args.output_file, bypass_check=args.bypass_check)
0412 
0413     # Do merging
0414     lhe_merger.merge()
0415 
0416 
0417 if __name__=="__main__":
0418     main()