File indexing completed on 2024-12-01 23:40:22
0001
0002
0003 import logging
0004 import argparse
0005 import math
0006 import glob
0007 import sys
0008 import os
0009 import re
0010
0011 import addLHEnumbers
0012
0013
0014 class BaseLHEMerger(object):
0015 """Base class of the LHE merge schemes"""
0016
0017 def __init__(self, input_files, output_file):
0018 self.input_files = input_files
0019 self.output_file = output_file
0020
0021 def merge(self):
0022 """Output the merged LHE"""
0023 pass
0024
0025 class DefaultLHEMerger(BaseLHEMerger):
0026 """Default LHE merge scheme that copies the header of the first LHE file,
0027 merges and outputs the init block, then concatenates all event blocks."""
0028
0029 def __init__(self, input_files, output_file, **kwargs):
0030 super(DefaultLHEMerger, self).__init__(input_files, output_file)
0031
0032 self.bypass_check = kwargs.get('bypass_check', False)
0033
0034 self._f = [self.file_iterator(name) for name in self.input_files]
0035 self._header_str = []
0036 self._is_mglo = False
0037 self._xsec_combined = 0.
0038 self._uwgt = 0.
0039 self._init_str = []
0040 self._nevent = []
0041
0042 def file_iterator(self, path):
0043 """Line-by-line iterator of a txt file"""
0044 with open(path, 'r') as f:
0045 for line in f:
0046 yield line
0047
0048 def check_header_compatibility(self):
0049 """Check if all headers for input files are consistent."""
0050
0051 if self.bypass_check:
0052 return
0053
0054 inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
0055 "Use -b/--bypass-check to bypass the check.")
0056 allow_diff_keys = [
0057 'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
0058 'Number of Events', 'Integrated weight'
0059 ]
0060 self._header_lines = [header.split('\n') for header in self._header_str]
0061
0062
0063 logging.debug('header line number: %s' \
0064 % ', '.join([str(len(lines)) for lines in self._header_lines]))
0065 assert all([
0066 len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
0067 ), inconsistent_error_info % "line number does not match"
0068 inconsistent_lines_set = [set() for _ in self._header_lines]
0069 for line_zip in zip(*self._header_lines):
0070 if any([k in line_zip[0] for k in allow_diff_keys]):
0071 logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
0072 continue
0073 if not all([line_zip[0] == line for line in line_zip]):
0074
0075 for i, line in enumerate(line_zip):
0076 inconsistent_lines_set[i].add(line)
0077
0078 assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
0079 inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
0080
0081 def merge_headers(self):
0082 """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
0083
0084 self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
0085 if self._is_mglo and not self.bypass_check:
0086
0087 match_geninfo = [
0088 re.search(
0089 (r"\<MGGenerationInfo\>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
0090 r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+\<\/MGGenerationInfo\>"),
0091 header
0092 ) for header in self._header_str
0093 ]
0094 self._xsec_combined = sum(
0095 [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
0096 ) / sum(self._nevent)
0097 geninfo_combined = ("<MGGenerationInfo>\n"
0098 "# Number of Events : %d\n"
0099 "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
0100 % (sum(self._nevent), self._xsec_combined)
0101 logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
0102 for info, nevt in zip(match_geninfo, self._nevent):
0103 logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
0104 logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
0105 % (sum(self._nevent), self._xsec_combined))
0106
0107 header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
0108 return header_combined
0109
0110 else:
0111
0112 return self._header_str[0]
0113
0114 def merge_init_blocks(self):
0115 """If all <init> blocks are identical, return the same <init> block
0116 (in the case of Powheg LHEs); otherwise, calculate the output <init>
0117 blocks by merging the input blocks info using formula (same with the
0118 MG5LOLHEMerger scheme):
0119 XSECUP = sum(xsecup * no.events) / tot.events
0120 XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
0121 XMAXUP = max(xmaxup)
0122 """
0123
0124 if self.bypass_check:
0125
0126
0127 return self._init_str[0]
0128
0129
0130
0131 new_init_block = {}
0132 old_init_block = [{} for _ in self._init_str]
0133
0134
0135
0136 for i, bl in enumerate(self._init_str):
0137 nline = int(bl.split('\n')[0].strip().split()[-1])
0138
0139
0140 for bl_line in bl.split('\n')[1:nline + 1]:
0141 bl_line_sp = bl_line.split()
0142 old_init_block[i][int(bl_line_sp[3])] = [
0143 float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
0144
0145
0146
0147 if i == 0:
0148 info_after_subprocess = bl.strip().split('\n')[nline + 1:]
0149
0150 logging.info('Input file: %s' % self.input_files[i])
0151 for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
0152
0153 logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0154 % tuple(old_init_block[i][ipr] + [ipr]))
0155
0156
0157
0158
0159 if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
0160
0161 logging.info(
0162 'All input <init> blocks are identical. Output the same "<init> block.')
0163 return self._init_str[0]
0164
0165
0166 for i in range(len(self._f)):
0167 for ipr in old_init_block[i]:
0168
0169
0170 if ipr not in new_init_block:
0171 new_init_block[ipr] = [0., 0., 0.]
0172 new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i]
0173 new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2
0174 new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2])
0175 tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
0176
0177
0178 self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
0179 + ' ' + str(len(new_init_block)) + '\n'
0180
0181 logging.info('Output file: %s' % self.output_file)
0182 for ipr in sorted(list(new_init_block.keys()), reverse=True):
0183
0184 new_init_block[ipr][0] /= tot_nevent
0185 new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
0186 logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0187 % tuple(new_init_block[ipr] + [ipr]))
0188 self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
0189 self._merged_init_str += '\n'.join(info_after_subprocess)
0190 if len(info_after_subprocess):
0191 self._merged_init_str += '\n'
0192
0193 return self._merged_init_str
0194
0195 def merge(self):
0196 with open(self.output_file, 'w') as fw:
0197
0198 for i in range(len(self._f)):
0199 header = []
0200 line = next(self._f[i])
0201 while not re.search('\s*<init(>|\s)', line):
0202 header.append(line)
0203 line = next(self._f[i])
0204
0205 self._header_str.append(''.join(header))
0206 self.check_header_compatibility()
0207
0208
0209 for i in range(len(self._f)):
0210 init = []
0211 line = next(self._f[i])
0212 while not re.search('\s*</init>', line):
0213 init.append(line)
0214 line = next(self._f[i])
0215
0216 self._init_str.append(''.join(init))
0217
0218
0219
0220 with open('.tmp.lhe', 'w') as _fwtmp:
0221 for i in range(len(self._f)):
0222 nevent = 0
0223 while True:
0224 line = next(self._f[i])
0225 if re.search('\s*</event>', line):
0226 nevent += 1
0227 if re.search('\s*</LesHouchesEvents>', line):
0228 break
0229 _fwtmp.write(line)
0230 self._nevent.append(nevent)
0231 self._f[i].close()
0232
0233
0234 fw.write(self.merge_headers())
0235 fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
0236
0237
0238
0239
0240 if self._is_mglo and not self.bypass_check:
0241 event_norm = re.search(
0242 r'\s(\w+)\s*=\s*event_norm\s',
0243 self._header_str[0]).group(1)
0244 if event_norm == 'sum':
0245 self._uwgt = self._xsec_combined / sum(self._nevent)
0246 elif event_norm == 'average':
0247 self._uwgt = self._xsec_combined
0248 logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
0249 "recalculate weights in each event block.\n"
0250 "Unit weight: %+.7E") % (event_norm, self._uwgt))
0251
0252
0253 event_line = -999
0254 with open('.tmp.lhe', 'r') as ftmp:
0255 sign = lambda x: -1 if x < 0 else 1
0256 for line in ftmp:
0257 event_line += 1
0258 if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
0259 event_line = 0
0260 if event_line == 1:
0261
0262
0263 orig_wgt = float(line.split()[2])
0264 fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
0265 % (sign(orig_wgt) * self._uwgt), line))
0266 elif re.search('\s*<wgt.*>.*</wgt>', line):
0267 addi_wgt_str = re.search(r'\<wgt.*\>\s*(\S+)\s*\<\/wgt\>', line).group(1)
0268 fw.write(line.replace(
0269 addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
0270 else:
0271 fw.write(line)
0272 else:
0273
0274 with open('.tmp.lhe', 'r') as ftmp:
0275 for line in ftmp:
0276 fw.write(line)
0277 fw.write('</LesHouchesEvents>\n')
0278 os.remove('.tmp.lhe')
0279
0280
0281 class MG5LOLHEMerger(BaseLHEMerger):
0282 """Use the merger script dedicated for MG5 LO LHEs, as introduced in
0283 https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
0284 """
0285
0286 def __init__(self, input_files, output_file, **kwargs):
0287 super(MG5LOLHEMerger, self).__init__(input_files, output_file)
0288 self._merger_script_url = \
0289 'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
0290
0291 def merge(self):
0292 logging.info(
0293 ('Use the merger script in genproductions dedicated for '
0294 'MadGraph5-produced LHEs'))
0295 os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
0296 % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
0297 os.system('gzip -df %s.gz' % self.output_file)
0298 os.system('rm banner.txt')
0299
0300
0301 class ExternalCppLHEMerger(BaseLHEMerger):
0302 """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
0303 https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
0304 """
0305
0306 def __init__(self, input_files, output_file, **kwargs):
0307 super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
0308 self._merger_script_url = \
0309 'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
0310
0311 def merge(self):
0312 logging.info(
0313 ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
0314 os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
0315 with open('mergeLheFiles.cpp') as f:
0316 script_str = f.read()
0317 with open('mergeLheFiles.cpp', 'w') as fw:
0318 fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
0319 with open('input_files.txt', 'w') as fw:
0320 fw.write('\n'.join(self.input_files) + '\n')
0321
0322 os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
0323 os.system('./mergeLheFiles input_files.txt')
0324 os.system('rm mergeLheFiles* input_files.txt')
0325
0326
0327 def main(argv = None):
0328 """Main routine of the script.
0329
0330 Arguments:
0331 - `argv`: arguments passed to the main routine
0332 """
0333
0334 if argv == None:
0335 argv = sys.argv[1:]
0336
0337 parser = argparse.ArgumentParser(
0338 description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
0339 "natural way.\n"
0340 "A detailed description of the merging step (in the default mode):\n"
0341 " 1. Header:\n"
0342 " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
0343 " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
0344 "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
0345 " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
0346 "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
0347 "each subprocess.\n"
0348 " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
0349 "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
0350 "run card.\n"
0351 "For further development of this script please always validate the merging result on the test "
0352 "routines: https://github.com/colizz/mergelhe_validate\n"
0353 "Example usage:\n"
0354 " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
0355 formatter_class=argparse.RawTextHelpFormatter)
0356 parser.add_argument("-i", "--input-files", type=str,
0357 help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
0358 parser.add_argument("-o", "--output-file",
0359 default='output.lhe', type=str,
0360 help="Output LHE file path.")
0361 parser.add_argument("--force-mglo-merger", action='store_true',
0362 help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
0363 "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
0364 parser.add_argument("--force-cpp-merger", action='store_true',
0365 help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
0366 "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
0367 parser.add_argument("-b", "--bypass-check", action='store_true',
0368 help=("Bypass the compatibility check for the headers. If true, the header and init block "
0369 "will be just a duplicate from the first input file, and events are concatenated without "
0370 "modification."))
0371 parser.add_argument("-n", "--number-events", action='store_true',
0372 help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
0373 parser.add_argument("--debug", action='store_true',
0374 help="Use the debug mode.")
0375 args = parser.parse_args(argv)
0376
0377 logging.basicConfig(
0378 format='[%(levelname)s] %(message)s',
0379 level=logging.INFO if not args.debug else DEBUG)
0380 logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
0381
0382
0383 assert len(args.input_files), \
0384 ('Please specify your input LHE files by -i/--input-files. '
0385 'Run \'mergeLHE.py -h\' for details.')
0386 input_files = []
0387 for path in args.input_files.split(','):
0388 find_files = glob.glob(path)
0389 if len(find_files) == 0:
0390 logging.info('Warning: cannot find files in %s' % path)
0391 input_files += find_files
0392 input_files.sort()
0393 logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
0394 logging.info('>>> Write to output: %s ' % args.output_file)
0395
0396 if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
0397 os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
0398
0399 if args.number_events:
0400 offset = 0
0401 for input_file in input_files:
0402 offset += addLHEnumbers.number_events(input_file, offset=offset)
0403
0404
0405 assert len(input_files) > 0, 'Input LHE files should be more than 0.'
0406 if len(input_files) == 1:
0407 logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
0408 import shutil
0409 shutil.copy(input_files[0], args.output_file)
0410 return
0411 assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
0412 "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
0413
0414
0415 if args.force_mglo_merger:
0416 lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
0417 elif args.force_cpp_merger:
0418 lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
0419 else:
0420 lhe_merger = DefaultLHEMerger(
0421 input_files, args.output_file, bypass_check=args.bypass_check)
0422
0423
0424 lhe_merger.merge()
0425
0426
0427 if __name__=="__main__":
0428 main()