Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-12-01 23:40:22

0001 #!/usr/bin/env python3
0002 
0003 import logging
0004 import argparse
0005 import math
0006 import glob
0007 import sys
0008 import os
0009 import re
0010 
0011 import addLHEnumbers
0012 
0013 
0014 class BaseLHEMerger(object):
0015     """Base class of the LHE merge schemes"""
0016 
0017     def __init__(self, input_files, output_file):
0018         self.input_files = input_files
0019         self.output_file = output_file
0020 
0021     def merge(self):
0022         """Output the merged LHE"""
0023         pass
0024 
0025 class DefaultLHEMerger(BaseLHEMerger):
0026     """Default LHE merge scheme that copies the header of the first LHE file,
0027     merges and outputs the init block, then concatenates all event blocks."""
0028 
0029     def __init__(self, input_files, output_file, **kwargs):
0030         super(DefaultLHEMerger, self).__init__(input_files, output_file)
0031 
0032         self.bypass_check = kwargs.get('bypass_check', False)
0033         # line-by-line iterator for each input file
0034         self._f = [self.file_iterator(name) for name in self.input_files]
0035         self._header_str = []
0036         self._is_mglo = False
0037         self._xsec_combined = 0.
0038         self._uwgt = 0.
0039         self._init_str = [] # initiated blocks for each input file
0040         self._nevent = [] # number of events for each input file
0041 
0042     def file_iterator(self, path):
0043         """Line-by-line iterator of a txt file"""
0044         with open(path, 'r') as f:
0045             for line in f:
0046                 yield line
0047 
0048     def check_header_compatibility(self):
0049         """Check if all headers for input files are consistent."""
0050 
0051         if self.bypass_check:
0052             return
0053 
0054         inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
0055                                   "Use -b/--bypass-check to bypass the check.")
0056         allow_diff_keys = [
0057             'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
0058             'Number of Events', 'Integrated weight'
0059             ]
0060         self._header_lines = [header.split('\n') for header in self._header_str]
0061 
0062         # Iterate over header lines for all input files and check consistency
0063         logging.debug('header line number: %s' \
0064             % ', '.join([str(len(lines)) for lines in self._header_lines]))
0065         assert all([
0066             len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
0067             ), inconsistent_error_info % "line number does not match"
0068         inconsistent_lines_set = [set() for _ in self._header_lines]
0069         for line_zip in zip(*self._header_lines):
0070             if any([k in line_zip[0] for k in allow_diff_keys]):
0071                 logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
0072                 continue
0073             if not all([line_zip[0] == line for line in line_zip]):
0074                 # Ok so meet inconsistency in some lines, then temporarily store them
0075                 for i, line in enumerate(line_zip):
0076                     inconsistent_lines_set[i].add(line)
0077         # Those inconsistent lines still match, meaning that it is only a change of order
0078         assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
0079             inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
0080 
0081     def merge_headers(self):
0082         """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
0083 
0084         self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
0085         if self._is_mglo and not self.bypass_check:
0086             # Special handling of MadGraph5 LO LHEs
0087             match_geninfo = [
0088                 re.search(
0089                     (r"\<MGGenerationInfo\>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
0090                     r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+\<\/MGGenerationInfo\>"),
0091                     header
0092                     ) for header in self._header_str
0093                 ]
0094             self._xsec_combined = sum(
0095                 [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
0096                 ) / sum(self._nevent)
0097             geninfo_combined = ("<MGGenerationInfo>\n"
0098                                 "#  Number of Events        : %d\n"
0099                                 "#  Integrated weight (pb)  : %.10f\n</MGGenerationInfo>") \
0100                                     % (sum(self._nevent), self._xsec_combined)
0101             logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
0102             for info, nevt in zip(match_geninfo, self._nevent):
0103                 logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
0104             logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
0105                 % (sum(self._nevent), self._xsec_combined))
0106 
0107             header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
0108             return header_combined
0109 
0110         else:
0111             # No need to merge the headers
0112             return self._header_str[0]
0113 
0114     def merge_init_blocks(self):
0115         """If all <init> blocks are identical, return the same <init> block
0116         (in the case of Powheg LHEs); otherwise, calculate the output <init>
0117         blocks by merging the input blocks info using formula (same with the
0118         MG5LOLHEMerger scheme):
0119             XSECUP = sum(xsecup * no.events) / tot.events
0120             XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
0121             XMAXUP = max(xmaxup)
0122         """
0123 
0124         if self.bypass_check:
0125             # If bypass the consistency check, simply use the first LHE <init>
0126             # block as the output
0127             return self._init_str[0]
0128 
0129         # Initiate collected init block info. Will be in format of
0130         # {iprocess: [xsecup, xerrup, xmaxup]}
0131         new_init_block = {}
0132         old_init_block = [{} for _ in self._init_str]
0133 
0134         # Read the xsecup, xerrup, and xmaxup info from the <init> block for
0135         # all input LHEs
0136         for i, bl in enumerate(self._init_str): # loop over files
0137             nline = int(bl.split('\n')[0].strip().split()[-1])
0138 
0139             # loop over lines in <init> block
0140             for bl_line in bl.split('\n')[1:nline + 1]:
0141                 bl_line_sp = bl_line.split()
0142                 old_init_block[i][int(bl_line_sp[3])] = [
0143                     float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
0144 
0145             # After reading all subprocesses info, store the rest content in
0146             # <init> block for the first file
0147             if i == 0:
0148                 info_after_subprocess = bl.strip().split('\n')[nline + 1:]
0149 
0150             logging.info('Input file: %s' % self.input_files[i])
0151             for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
0152                 # reverse order: follow the MG5 custom
0153                 logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0154                     % tuple(old_init_block[i][ipr] + [ipr]))
0155 
0156         # Adopt smarter <init> block merging method
0157         # If all <init> blocks from input files are identical, return the same block;
0158         # otherwise combine them based on MG5LOLHEMerger scheme
0159         if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
0160             # All <init> blocks are identical
0161             logging.info(
0162                 'All input <init> blocks are identical. Output the same "<init> block.')
0163             return self._init_str[0]
0164 
0165         # Otherwise, calculate merged init block
0166         for i in range(len(self._f)):
0167             for ipr in old_init_block[i]:
0168                 # Initiate the subprocess for the new block if it is found for the
0169                 # first time in one input file
0170                 if ipr not in new_init_block:
0171                     new_init_block[ipr] = [0., 0., 0.]
0172                 new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i] # xsecup
0173                 new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2 # xerrup
0174                 new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2]) # xmaxup
0175         tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
0176 
0177         # Write first line of the <init> block (modify the nprocess at the last)
0178         self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
0179             + ' ' + str(len(new_init_block)) + '\n'
0180         # Form the merged init block
0181         logging.info('Output file: %s' % self.output_file)
0182         for ipr in sorted(list(new_init_block.keys()), reverse=True):
0183             # reverse order: follow the MG5 custom
0184             new_init_block[ipr][0] /= tot_nevent
0185             new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
0186             logging.info('  xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0187                 % tuple(new_init_block[ipr] + [ipr]))
0188             self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
0189         self._merged_init_str += '\n'.join(info_after_subprocess)
0190         if len(info_after_subprocess):
0191             self._merged_init_str += '\n'
0192 
0193         return self._merged_init_str
0194 
0195     def merge(self):
0196         with open(self.output_file, 'w') as fw:
0197             # Read the header for the all input files
0198             for i in range(len(self._f)):
0199                 header = []
0200                 line = next(self._f[i])
0201                 while not re.search('\s*<init(>|\s)', line):
0202                     header.append(line)
0203                     line = next(self._f[i])
0204                 # 'header' includes all contents before reaches <init>
0205                 self._header_str.append(''.join(header))
0206             self.check_header_compatibility()
0207 
0208             # Read <init> blocks for all input_files
0209             for i in range(len(self._f)):
0210                 init = []
0211                 line = next(self._f[i])
0212                 while not re.search('\s*</init>', line):
0213                     init.append(line)
0214                     line = next(self._f[i])
0215                 # 'init_str' includes all contents inside <init>...</init>
0216                 self._init_str.append(''.join(init))
0217 
0218             # Iterate over all events file-by-file and write events temporarily
0219             # to .tmp.lhe
0220             with open('.tmp.lhe', 'w') as _fwtmp:
0221                 for i in range(len(self._f)):
0222                     nevent = 0
0223                     while True:
0224                         line = next(self._f[i])
0225                         if re.search('\s*</event>', line):
0226                             nevent += 1
0227                         if re.search('\s*</LesHouchesEvents>', line):
0228                             break
0229                         _fwtmp.write(line)
0230                     self._nevent.append(nevent)
0231                     self._f[i].close()
0232 
0233             # Merge the header and init blocks and write to the output
0234             fw.write(self.merge_headers())
0235             fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
0236 
0237             # Write event blocks in .tmp.lhe back to the output
0238             # If is MG5 LO LHE, will recalculate the weights based on combined xsec
0239             # and nevent read from <MGGenerationInfo>, and the 'event_norm' mode
0240             if self._is_mglo and not self.bypass_check:
0241                 event_norm = re.search(
0242                     r'\s(\w+)\s*=\s*event_norm\s',
0243                     self._header_str[0]).group(1)
0244                 if event_norm == 'sum':
0245                     self._uwgt = self._xsec_combined / sum(self._nevent)
0246                 elif event_norm == 'average':
0247                     self._uwgt = self._xsec_combined
0248                 logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
0249                              "recalculate weights in each event block.\n"
0250                              "Unit weight: %+.7E") % (event_norm, self._uwgt))
0251 
0252                 # Modify event wgt when transfering .tmp.lhe to the output file
0253                 event_line = -999
0254                 with open('.tmp.lhe', 'r') as ftmp:
0255                     sign = lambda x: -1 if x < 0 else 1
0256                     for line in ftmp:
0257                         event_line += 1
0258                         if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
0259                             event_line = 0
0260                         if event_line == 1:
0261                             # modify the XWGTUP appeared in the first line of the
0262                             # <event> block
0263                             orig_wgt = float(line.split()[2])
0264                             fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
0265                                 % (sign(orig_wgt) * self._uwgt), line))
0266                         elif re.search('\s*<wgt.*>.*</wgt>', line):
0267                             addi_wgt_str = re.search(r'\<wgt.*\>\s*(\S+)\s*\<\/wgt\>', line).group(1)
0268                             fw.write(line.replace(
0269                                 addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
0270                         else:
0271                             fw.write(line)
0272             else:
0273                 # Simply transfer all lines
0274                 with open('.tmp.lhe', 'r') as ftmp:
0275                     for line in ftmp:
0276                         fw.write(line)
0277             fw.write('</LesHouchesEvents>\n')
0278             os.remove('.tmp.lhe')
0279 
0280 
0281 class MG5LOLHEMerger(BaseLHEMerger):
0282     """Use the merger script dedicated for MG5 LO LHEs, as introduced in
0283     https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
0284     """
0285 
0286     def __init__(self, input_files, output_file, **kwargs):
0287         super(MG5LOLHEMerger, self).__init__(input_files, output_file)
0288         self._merger_script_url = \
0289             'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
0290 
0291     def merge(self):
0292         logging.info(
0293             ('Use the merger script in genproductions dedicated for '
0294             'MadGraph5-produced LHEs'))
0295         os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
0296             % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
0297         os.system('gzip -df %s.gz' % self.output_file)
0298         os.system('rm banner.txt')
0299 
0300 
0301 class ExternalCppLHEMerger(BaseLHEMerger):
0302     """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
0303     https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
0304     """
0305 
0306     def __init__(self, input_files, output_file, **kwargs):
0307         super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
0308         self._merger_script_url = \
0309             'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
0310 
0311     def merge(self):
0312         logging.info(
0313             ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
0314         os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
0315         with open('mergeLheFiles.cpp') as f:
0316             script_str = f.read()
0317         with open('mergeLheFiles.cpp', 'w') as fw:
0318             fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
0319         with open('input_files.txt', 'w') as fw:
0320             fw.write('\n'.join(self.input_files) + '\n')
0321 
0322         os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
0323         os.system('./mergeLheFiles input_files.txt')
0324         os.system('rm mergeLheFiles* input_files.txt')
0325 
0326 
0327 def main(argv = None):
0328     """Main routine of the script.
0329 
0330     Arguments:
0331     - `argv`: arguments passed to the main routine
0332     """
0333 
0334     if argv == None:
0335         argv = sys.argv[1:]
0336 
0337     parser = argparse.ArgumentParser(
0338         description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
0339                     "natural way.\n"
0340                     "A detailed description of the merging step (in the default mode):\n"
0341                     "  1. Header:\n"
0342                     "    a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
0343                     "    b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
0344                     "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
0345                     "  2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
0346                     "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
0347                     "each subprocess.\n"
0348                     "  3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
0349                     "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
0350                     "run card.\n"
0351                     "For further development of this script please always validate the merging result on the test "
0352                     "routines: https://github.com/colizz/mergelhe_validate\n"
0353                     "Example usage:\n"
0354                     "    mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
0355         formatter_class=argparse.RawTextHelpFormatter)
0356     parser.add_argument("-i", "--input-files", type=str,
0357                         help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
0358     parser.add_argument("-o", "--output-file",
0359                         default='output.lhe', type=str,
0360                         help="Output LHE file path.")
0361     parser.add_argument("--force-mglo-merger", action='store_true',
0362                         help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
0363                              "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
0364     parser.add_argument("--force-cpp-merger", action='store_true',
0365                         help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
0366                              "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
0367     parser.add_argument("-b", "--bypass-check", action='store_true',
0368                         help=("Bypass the compatibility check for the headers. If true, the header and init block "
0369                              "will be just a duplicate from the first input file, and events are concatenated without "
0370                              "modification."))
0371     parser.add_argument("-n", "--number-events", action='store_true',
0372                         help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
0373     parser.add_argument("--debug", action='store_true',
0374                         help="Use the debug mode.")
0375     args = parser.parse_args(argv)
0376 
0377     logging.basicConfig(
0378         format='[%(levelname)s] %(message)s',
0379         level=logging.INFO if not args.debug else DEBUG)
0380     logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
0381 
0382     # Extract input LHE files from the path
0383     assert len(args.input_files), \
0384         ('Please specify your input LHE files by -i/--input-files. '
0385         'Run \'mergeLHE.py -h\' for details.')
0386     input_files = [] # each individual input file
0387     for path in args.input_files.split(','):
0388         find_files = glob.glob(path)
0389         if len(find_files) == 0:
0390             logging.info('Warning: cannot find files in %s' % path)
0391         input_files += find_files
0392     input_files.sort()
0393     logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
0394     logging.info('>>> Write to output: %s ' % args.output_file)
0395 
0396     if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
0397         os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
0398 
0399     if args.number_events:
0400         offset = 0
0401         for input_file in input_files:
0402             offset += addLHEnumbers.number_events(input_file, offset=offset)
0403 
0404     # Check arguments
0405     assert len(input_files) > 0, 'Input LHE files should be more than 0.'
0406     if len(input_files) == 1:
0407         logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
0408         import shutil
0409         shutil.copy(input_files[0], args.output_file)
0410         return
0411     assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
0412         "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
0413 
0414     # Determine the merging scheme
0415     if args.force_mglo_merger:
0416         lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
0417     elif args.force_cpp_merger:
0418         lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
0419     else:
0420         lhe_merger = DefaultLHEMerger(
0421             input_files, args.output_file, bypass_check=args.bypass_check)
0422 
0423     # Do merging
0424     lhe_merger.merge()
0425 
0426 
0427 if __name__=="__main__":
0428     main()