Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:13:50

0001 #!/usr/bin/env python3
0002 
0003 from __future__ import print_function
0004 import logging
0005 import argparse
0006 import math
0007 import glob
0008 import sys
0009 import os
0010 import re
0011 
0012 import addLHEnumbers
0013 
0014 
0015 class BaseLHEMerger(object):
0016     """Base class of the LHE merge schemes"""
0017 
0018     def __init__(self, input_files, output_file):
0019         self.input_files = input_files
0020         self.output_file = output_file
0021 
0022     def merge(self):
0023         """Output the merged LHE"""
0024         pass
0025 
0026 class DefaultLHEMerger(BaseLHEMerger):
0027     """Default LHE merge scheme that copies the header of the first LHE file,
0028     merges and outputs the init block, then concatenates all event blocks."""
0029 
0030     def __init__(self, input_files, output_file, **kwargs):
0031         super(DefaultLHEMerger, self).__init__(input_files, output_file)
0032 
0033         self.bypass_check = kwargs.get('bypass_check', False)
0034         # line-by-line iterator for each input file
0035         self._f = [self.file_iterator(name) for name in self.input_files]
0036         self._header_str = []
0037         self._is_mglo = False
0038         self._xsec_combined = 0.
0039         self._uwgt = 0.
0040         self._init_str = [] # initiated blocks for each input file
0041         self._nevent = [] # number of events for each input file
0042 
0043     def file_iterator(self, path):
0044         """Line-by-line iterator of a txt file"""
0045         with open(path, 'r') as f:
0046             for line in f:
0047                 yield line
0048 
0049     def check_header_compatibility(self):
0050         """Check if all headers for input files are consistent."""
0051 
0052         if self.bypass_check:
0053             return
0054 
0055         inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
0056                                   "Use -b/--bypass-check to bypass the check.")
0057         allow_diff_keys = [
0058             'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
0059             'Number of Events', 'Integrated weight'
0060             ]
0061         self._header_lines = [header.split('\n') for header in self._header_str]
0062 
0063         # Iterate over header lines for all input files and check consistency
0064         logging.debug('header line number: %s' \
0065             % ', '.join([str(len(lines)) for lines in self._header_lines]))
0066         assert all([
0067             len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
0068             ), inconsistent_error_info % "line number does not match"
0069         inconsistent_lines_set = [set() for _ in self._header_lines]
0070         for line_zip in zip(*self._header_lines):
0071             if any([k in line_zip[0] for k in allow_diff_keys]):
0072                 logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
0073                 continue
0074             if not all([line_zip[0] == line for line in line_zip]):
0075                 # Ok so meet inconsistency in some lines, then temporarily store them
0076                 for i, line in enumerate(line_zip):
0077                     inconsistent_lines_set[i].add(line)
0078         # Those inconsistent lines still match, meaning that it is only a change of order
0079         assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
0080             inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
0081 
0082     def merge_headers(self):
0083         """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
0084 
0085         self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
0086         if self._is_mglo and not self.bypass_check:
0087             # Special handling of MadGraph5 LO LHEs
0088             match_geninfo = [
0089                 re.search(
0090                     (r"\<MGGenerationInfo\>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
0091                     r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+\<\/MGGenerationInfo\>"),
0092                     header
0093                     ) for header in self._header_str
0094                 ]
0095             self._xsec_combined = sum(
0096                 [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
0097                 ) / sum(self._nevent)
0098             geninfo_combined = ("<MGGenerationInfo>\n"
0099                                 "#  Number of Events        : %d\n"
0100                                 "#  Integrated weight (pb)  : %.10f\n</MGGenerationInfo>") \
0101                                     % (sum(self._nevent), self._xsec_combined)
0102             logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
0103             for info, nevt in zip(match_geninfo, self._nevent):
0104                 logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
0105             logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
0106                 % (sum(self._nevent), self._xsec_combined))
0107 
0108             header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
0109             return header_combined
0110 
0111         else:
0112             # No need to merge the headers
0113             return self._header_str[0]
0114 
0115     def merge_init_blocks(self):
0116         """If all <init> blocks are identical, return the same <init> block
0117         (in the case of Powheg LHEs); otherwise, calculate the output <init>
0118         blocks by merging the input blocks info using formula (same with the
0119         MG5LOLHEMerger scheme):
0120             XSECUP = sum(xsecup * no.events) / tot.events
0121             XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
0122             XMAXUP = max(xmaxup)
0123         """
0124 
0125         if self.bypass_check:
0126             # If bypass the consistency check, simply use the first LHE <init>
0127             # block as the output
0128             return self._init_str[0]
0129 
0130         # Initiate collected init block info. Will be in format of
0131         # {iprocess: [xsecup, xerrup, xmaxup]}
0132         new_init_block = {}
0133         old_init_block = [{} for _ in self._init_str]
0134 
0135         # Read the xsecup, xerrup, and xmaxup info from the <init> block for
0136         # all input LHEs
0137         for i, bl in enumerate(self._init_str): # loop over files
0138             nline = int(bl.split('\n')[0].strip().split()[-1])
0139 
0140             # loop over lines in <init> block
0141             for bl_line in bl.split('\n')[1:nline + 1]:
0142                 bl_line_sp = bl_line.split()
0143                 old_init_block[i][int(bl_line_sp[3])] = [
0144                     float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
0145 
0146             # After reading all subprocesses info, store the rest content in
0147             # <init> block for the first file
0148             if i == 0:
0149                 info_after_subprocess = bl.strip().split('\n')[nline + 1:]
0150 
0151             logging.info('Input file: %s' % self.input_files[i])
0152             for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
0153                 # reverse order: follow the MG5 custom
0154                 logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0155                     % tuple(old_init_block[i][ipr] + [ipr]))
0156 
0157         # Adopt smarter <init> block merging method
0158         # If all <init> blocks from input files are identical, return the same block;
0159         # otherwise combine them based on MG5LOLHEMerger scheme
0160         if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
0161             # All <init> blocks are identical
0162             logging.info(
0163                 'All input <init> blocks are identical. Output the same "<init> block.')
0164             return self._init_str[0]
0165 
0166         # Otherwise, calculate merged init block
0167         for i in range(len(self._f)):
0168             for ipr in old_init_block[i]:
0169                 # Initiate the subprocess for the new block if it is found for the
0170                 # first time in one input file
0171                 if ipr not in new_init_block:
0172                     new_init_block[ipr] = [0., 0., 0.]
0173                 new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
0174                 new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
0175                 new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
0176         tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
0177 
0178         # Write first line of the <init> block (modify the nprocess at the last)
0179         self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
0180             + ' ' + str(len(new_init_block)) + '\n'
0181         # Form the merged init block
0182         logging.info('Output file: %s' % self.output_file)
0183         for ipr in sorted(list(new_init_block.keys()), reverse=True):
0184             # reverse order: follow the MG5 custom
0185             new_init_block[ipr][0] /= tot_nevent
0186             new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
0187             logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0188                 % tuple(new_init_block[ipr] + [ipr]))
0189             self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
0190         self._merged_init_str += '\n'.join(info_after_subprocess)
0191         if len(info_after_subprocess):
0192             self._merged_init_str += '\n'
0193 
0194         return self._merged_init_str
0195 
0196     def merge(self):
0197         with open(self.output_file, 'w') as fw:
0198             # Read the header for the all input files
0199             for i in range(len(self._f)):
0200                 header = []
0201                 line = next(self._f[i])
0202                 while not re.search('\s*<init(>|\s)', line):
0203                     header.append(line)
0204                     line = next(self._f[i])
0205                 # 'header' includes all contents before reaches <init>
0206                 self._header_str.append(''.join(header))
0207             self.check_header_compatibility()
0208 
0209             # Read <init> blocks for all input_files
0210             for i in range(len(self._f)):
0211                 init = []
0212                 line = next(self._f[i])
0213                 while not re.search('\s*</init>', line):
0214                     init.append(line)
0215                     line = next(self._f[i])
0216                 # 'init_str' includes all contents inside <init>...</init>
0217                 self._init_str.append(''.join(init))
0218 
0219             # Iterate over all events file-by-file and write events temporarily
0220             # to .tmp.lhe
0221             with open('.tmp.lhe', 'w') as _fwtmp:
0222                 for i in range(len(self._f)):
0223                     nevent = 0
0224                     while True:
0225                         line = next(self._f[i])
0226                         if re.search('\s*</event>', line):
0227                             nevent += 1
0228                         if re.search('\s*</LesHouchesEvents>', line):
0229                             break
0230                         _fwtmp.write(line)
0231                     self._nevent.append(nevent)
0232                     self._f[i].close()
0233 
0234             # Merge the header and init blocks and write to the output
0235             fw.write(self.merge_headers())
0236             fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
0237 
0238             # Write event blocks in .tmp.lhe back to the output
0239             # If is MG5 LO LHE, will recalculate the weights based on combined xsec
0240             # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
0241             if self._is_mglo and not self.bypass_check:
0242                 event_norm = re.search(
0243                     r'\s(\w+)\s*=\s*event_norm\s',
0244                     self._header_str[0]).group(1)
0245                 if event_norm == 'sum':
0246                     self._uwgt = self._xsec_combined / sum(self._nevent)
0247                 elif event_norm == 'average':
0248                     self._uwgt = self._xsec_combined
0249                 logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
0250                              "recalculate weights in each event block.\n"
0251                              "Unit weight: %+.7E") % (event_norm, self._uwgt))
0252 
0253                 # Modify event wgt when transfering .tmp.lhe to the output file
0254                 event_line = -999
0255                 with open('.tmp.lhe', 'r') as ftmp:
0256                     sign = lambda x: -1 if x < 0 else 1
0257                     for line in ftmp:
0258                         event_line += 1
0259                         if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
0260                             event_line = 0
0261                         if event_line == 1:
0262                             # modify the XWGTUP appeared in the first line of the
0263                             # <event> block
0264                             orig_wgt = float(line.split()[2])
0265                             fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
0266                                 % (sign(orig_wgt) * self._uwgt), line))
0267                         elif re.search('\s*<wgt.*>.*</wgt>', line):
0268                             addi_wgt_str = re.search(r'\<wgt.*\>\s*(\S+)\s*\<\/wgt\>', line).group(1)
0269                             fw.write(line.replace(
0270                                 addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
0271                         else:
0272                             fw.write(line)
0273             else:
0274                 # Simply transfer all lines
0275                 with open('.tmp.lhe', 'r') as ftmp:
0276                     for line in ftmp:
0277                         fw.write(line)
0278             fw.write('</LesHouchesEvents>\n')
0279             os.remove('.tmp.lhe')
0280 
0281 
0282 class MG5LOLHEMerger(BaseLHEMerger):
0283     """Use the merger script dedicated for MG5 LO LHEs, as introduced in
0284     https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
0285     """
0286 
0287     def __init__(self, input_files, output_file, **kwargs):
0288         super(MG5LOLHEMerger, self).__init__(input_files, output_file)
0289         self._merger_script_url = \
0290             'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
0291 
0292     def merge(self):
0293         logging.info(
0294             ('Use the merger script in genproductions dedicated for '
0295             'MadGraph5-produced LHEs'))
0296         os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
0297             % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
0298         os.system('gzip -df %s.gz' % self.output_file)
0299         os.system('rm banner.txt')
0300 
0301 
0302 class ExternalCppLHEMerger(BaseLHEMerger):
0303     """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
0304     https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
0305     """
0306 
0307     def __init__(self, input_files, output_file, **kwargs):
0308         super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
0309         self._merger_script_url = \
0310             'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
0311 
0312     def merge(self):
0313         logging.info(
0314             ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
0315         os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
0316         with open('mergeLheFiles.cpp') as f:
0317             script_str = f.read()
0318         with open('mergeLheFiles.cpp', 'w') as fw:
0319             fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
0320         with open('input_files.txt', 'w') as fw:
0321             fw.write('\n'.join(self.input_files) + '\n')
0322 
0323         os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
0324         os.system('./mergeLheFiles input_files.txt')
0325         os.system('rm mergeLheFiles* input_files.txt')
0326 
0327 
0328 def main(argv = None):
0329     """Main routine of the script.
0330 
0331     Arguments:
0332     - `argv`: arguments passed to the main routine
0333     """
0334 
0335     if argv == None:
0336         argv = sys.argv[1:]
0337 
0338     parser = argparse.ArgumentParser(
0339         description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
0340                     "natural way.\n"
0341                     "A detailed description of the merging step (in the default mode):\n"
0342                     "  1. Header:\n"
0343                     "    a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
0344                     "    b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
0345                     "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
0346                     "  2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
0347                     "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
0348                     "each subprocess.\n"
0349                     "  3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
0350                     "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
0351                     "run card.\n"
0352                     "For further development of this script please always validate the merging result on the test "
0353                     "routines: https://github.com/colizz/mergelhe_validate\n"
0354                     "Example usage:\n"
0355                     "    mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
0356         formatter_class=argparse.RawTextHelpFormatter)
0357     parser.add_argument("-i", "--input-files", type=str,
0358                         help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
0359     parser.add_argument("-o", "--output-file",
0360                         default='output.lhe', type=str,
0361                         help="Output LHE file path.")
0362     parser.add_argument("--force-mglo-merger", action='store_true',
0363                         help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
0364                              "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
0365     parser.add_argument("--force-cpp-merger", action='store_true',
0366                         help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
0367                              "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
0368     parser.add_argument("-b", "--bypass-check", action='store_true',
0369                         help=("Bypass the compatibility check for the headers. If true, the header and init block "
0370                              "will be just a duplicate from the first input file, and events are concatenated without "
0371                              "modification."))
0372     parser.add_argument("-n", "--number-events", action='store_true',
0373                         help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
0374     parser.add_argument("--debug", action='store_true',
0375                         help="Use the debug mode.")
0376     args = parser.parse_args(argv)
0377 
0378     logging.basicConfig(
0379         format='[%(levelname)s] %(message)s',
0380         level=logging.INFO if not args.debug else DEBUG)
0381     logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
0382 
0383     # Extract input LHE files from the path
0384     assert len(args.input_files), \
0385         ('Please specify your input LHE files by -i/--input-files. '
0386         'Run \'mergeLHE.py -h\' for details.')
0387     input_files = [] # each individual input file
0388     for path in args.input_files.split(','):
0389         find_files = glob.glob(path)
0390         if len(find_files) == 0:
0391             logging.info('Warning: cannot find files in %s' % path)
0392         input_files += find_files
0393     input_files.sort()
0394     logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
0395     logging.info('>>> Write to output: %s ' % args.output_file)
0396 
0397     if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
0398         os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
0399 
0400     if args.number_events:
0401         offset = 0
0402         for input_file in input_files:
0403             offset += addLHEnumbers.number_events(input_file, offset=offset)
0404 
0405     # Check arguments
0406     assert len(input_files) > 0, 'Input LHE files should be more than 0.'
0407     if len(input_files) == 1:
0408         logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
0409         import shutil
0410         shutil.copy(input_files[0], args.output_file)
0411         return
0412     assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
0413         "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
0414 
0415     # Determine the merging scheme
0416     if args.force_mglo_merger:
0417         lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
0418     elif args.force_cpp_merger:
0419         lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
0420     else:
0421         lhe_merger = DefaultLHEMerger(
0422             input_files, args.output_file, bypass_check=args.bypass_check)
0423 
0424     # Do merging
0425     lhe_merger.merge()
0426 
0427 
0428 if __name__=="__main__":
0429     main()