File indexing completed on 2024-04-06 12:13:50
0001
0002
0003 from __future__ import print_function
0004 import logging
0005 import argparse
0006 import math
0007 import glob
0008 import sys
0009 import os
0010 import re
0011
0012 import addLHEnumbers
0013
0014
0015 class BaseLHEMerger(object):
0016 """Base class of the LHE merge schemes"""
0017
0018 def __init__(self, input_files, output_file):
0019 self.input_files = input_files
0020 self.output_file = output_file
0021
0022 def merge(self):
0023 """Output the merged LHE"""
0024 pass
0025
0026 class DefaultLHEMerger(BaseLHEMerger):
0027 """Default LHE merge scheme that copies the header of the first LHE file,
0028 merges and outputs the init block, then concatenates all event blocks."""
0029
0030 def __init__(self, input_files, output_file, **kwargs):
0031 super(DefaultLHEMerger, self).__init__(input_files, output_file)
0032
0033 self.bypass_check = kwargs.get('bypass_check', False)
0034
0035 self._f = [self.file_iterator(name) for name in self.input_files]
0036 self._header_str = []
0037 self._is_mglo = False
0038 self._xsec_combined = 0.
0039 self._uwgt = 0.
0040 self._init_str = []
0041 self._nevent = []
0042
0043 def file_iterator(self, path):
0044 """Line-by-line iterator of a txt file"""
0045 with open(path, 'r') as f:
0046 for line in f:
0047 yield line
0048
0049 def check_header_compatibility(self):
0050 """Check if all headers for input files are consistent."""
0051
0052 if self.bypass_check:
0053 return
0054
0055 inconsistent_error_info = ("Incompatibility found in LHE headers: %s. "
0056 "Use -b/--bypass-check to bypass the check.")
0057 allow_diff_keys = [
0058 'nevent', 'numevts', 'iseed', 'Seed', 'Random', '.log', '.dat', '.lhe',
0059 'Number of Events', 'Integrated weight'
0060 ]
0061 self._header_lines = [header.split('\n') for header in self._header_str]
0062
0063
0064 logging.debug('header line number: %s' \
0065 % ', '.join([str(len(lines)) for lines in self._header_lines]))
0066 assert all([
0067 len(self._header_lines[0]) == len(lines) for lines in self._header_lines]
0068 ), inconsistent_error_info % "line number does not match"
0069 inconsistent_lines_set = [set() for _ in self._header_lines]
0070 for line_zip in zip(*self._header_lines):
0071 if any([k in line_zip[0] for k in allow_diff_keys]):
0072 logging.debug('Captured \'%s\', we allow difference in this line' % line_zip[0])
0073 continue
0074 if not all([line_zip[0] == line for line in line_zip]):
0075
0076 for i, line in enumerate(line_zip):
0077 inconsistent_lines_set[i].add(line)
0078
0079 assert all([inconsistent_lines_set[0] == lset for lset in inconsistent_lines_set]), \
0080 inconsistent_error_info % ('{' + ', '.join(inconsistent_lines_set[0]) + '}')
0081
0082 def merge_headers(self):
0083 """Merge the headers of input LHEs. Need special handle for the MG5 LO case."""
0084
0085 self._is_mglo = all(['MGGenerationInfo' in header for header in self._header_str])
0086 if self._is_mglo and not self.bypass_check:
0087
0088 match_geninfo = [
0089 re.search(
0090 (r"\<MGGenerationInfo\>\s+#\s*Number of Events\s*\:\s*(\S+)\s+"
0091 r"#\s*Integrated weight \(pb\)\s*\:\s*(\S+)\s+\<\/MGGenerationInfo\>"),
0092 header
0093 ) for header in self._header_str
0094 ]
0095 self._xsec_combined = sum(
0096 [float(info.group(2)) * nevt for info, nevt in zip(match_geninfo, self._nevent)]
0097 ) / sum(self._nevent)
0098 geninfo_combined = ("<MGGenerationInfo>\n"
0099 "# Number of Events : %d\n"
0100 "# Integrated weight (pb) : %.10f\n</MGGenerationInfo>") \
0101 % (sum(self._nevent), self._xsec_combined)
0102 logging.info('Detected: MG5 LO LHEs. Input <MGGenerationInfo>:\n\tnevt\txsec')
0103 for info, nevt in zip(match_geninfo, self._nevent):
0104 logging.info('\t%d\t%.10f' % (nevt, float(info.group(2))))
0105 logging.info('Combined <MGGenerationInfo>:\n\t%d\t%.10f' \
0106 % (sum(self._nevent), self._xsec_combined))
0107
0108 header_combined = self._header_str[0].replace(match_geninfo[0].group(), geninfo_combined)
0109 return header_combined
0110
0111 else:
0112
0113 return self._header_str[0]
0114
0115 def merge_init_blocks(self):
0116 """If all <init> blocks are identical, return the same <init> block
0117 (in the case of Powheg LHEs); otherwise, calculate the output <init>
0118 blocks by merging the input blocks info using formula (same with the
0119 MG5LOLHEMerger scheme):
0120 XSECUP = sum(xsecup * no.events) / tot.events
0121 XERRUP = sqrt( sum(sigma^2 * no.events^2) ) / tot.events
0122 XMAXUP = max(xmaxup)
0123 """
0124
0125 if self.bypass_check:
0126
0127
0128 return self._init_str[0]
0129
0130
0131
0132 new_init_block = {}
0133 old_init_block = [{} for _ in self._init_str]
0134
0135
0136
0137 for i, bl in enumerate(self._init_str):
0138 nline = int(bl.split('\n')[0].strip().split()[-1])
0139
0140
0141 for bl_line in bl.split('\n')[1:nline + 1]:
0142 bl_line_sp = bl_line.split()
0143 old_init_block[i][int(bl_line_sp[3])] = [
0144 float(bl_line_sp[0]), float(bl_line_sp[1]), float(bl_line_sp[2])]
0145
0146
0147
0148 if i == 0:
0149 info_after_subprocess = bl.strip().split('\n')[nline + 1:]
0150
0151 logging.info('Input file: %s' % self.input_files[i])
0152 for ipr in sorted(list(old_init_block[i].keys()), reverse=True):
0153
0154 logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0155 % tuple(old_init_block[i][ipr] + [ipr]))
0156
0157
0158
0159
0160 if all([old_init_block[i] == old_init_block[0] for i in range(len(self._f))]):
0161
0162 logging.info(
0163 'All input <init> blocks are identical. Output the same "<init> block.')
0164 return self._init_str[0]
0165
0166
0167 for i in range(len(self._f)):
0168 for ipr in old_init_block[i]:
0169
0170
0171 if ipr not in new_init_block:
0172 new_init_block[ipr] = [0., 0., 0.]
0173 new_init_block[ipr][0] += old_init_block[i][ipr][0] * self._nevent[i]
0174 new_init_block[ipr][1] += old_init_block[i][ipr][1]**2 * self._nevent[i]**2
0175 new_init_block[ipr][2] = max(new_init_block[ipr][2], old_init_block[i][ipr][2])
0176 tot_nevent = sum([self._nevent[i] for i in range(len(self._f))])
0177
0178
0179 self._merged_init_str = self._init_str[0].split('\n')[0].strip().rsplit(' ', 1)[0] \
0180 + ' ' + str(len(new_init_block)) + '\n'
0181
0182 logging.info('Output file: %s' % self.output_file)
0183 for ipr in sorted(list(new_init_block.keys()), reverse=True):
0184
0185 new_init_block[ipr][0] /= tot_nevent
0186 new_init_block[ipr][1] = math.sqrt(new_init_block[ipr][1]) / tot_nevent
0187 logging.info(' xsecup, xerrup, xmaxup, lprup: %.6E, %.6E, %.6E, %d' \
0188 % tuple(new_init_block[ipr] + [ipr]))
0189 self._merged_init_str += '%.6E %.6E %.6E %d\n' % tuple(new_init_block[ipr] + [ipr])
0190 self._merged_init_str += '\n'.join(info_after_subprocess)
0191 if len(info_after_subprocess):
0192 self._merged_init_str += '\n'
0193
0194 return self._merged_init_str
0195
0196 def merge(self):
0197 with open(self.output_file, 'w') as fw:
0198
0199 for i in range(len(self._f)):
0200 header = []
0201 line = next(self._f[i])
0202 while not re.search('\s*<init(>|\s)', line):
0203 header.append(line)
0204 line = next(self._f[i])
0205
0206 self._header_str.append(''.join(header))
0207 self.check_header_compatibility()
0208
0209
0210 for i in range(len(self._f)):
0211 init = []
0212 line = next(self._f[i])
0213 while not re.search('\s*</init>', line):
0214 init.append(line)
0215 line = next(self._f[i])
0216
0217 self._init_str.append(''.join(init))
0218
0219
0220
0221 with open('.tmp.lhe', 'w') as _fwtmp:
0222 for i in range(len(self._f)):
0223 nevent = 0
0224 while True:
0225 line = next(self._f[i])
0226 if re.search('\s*</event>', line):
0227 nevent += 1
0228 if re.search('\s*</LesHouchesEvents>', line):
0229 break
0230 _fwtmp.write(line)
0231 self._nevent.append(nevent)
0232 self._f[i].close()
0233
0234
0235 fw.write(self.merge_headers())
0236 fw.write('<init>\n' + self.merge_init_blocks() + '</init>\n')
0237
0238
0239
0240
0241 if self._is_mglo and not self.bypass_check:
0242 event_norm = re.search(
0243 r'\s(\w+)\s*=\s*event_norm\s',
0244 self._header_str[0]).group(1)
0245 if event_norm == 'sum':
0246 self._uwgt = self._xsec_combined / sum(self._nevent)
0247 elif event_norm == 'average':
0248 self._uwgt = self._xsec_combined
0249 logging.info(("MG5 LO LHE with event_norm = %s detected. Will "
0250 "recalculate weights in each event block.\n"
0251 "Unit weight: %+.7E") % (event_norm, self._uwgt))
0252
0253
0254 event_line = -999
0255 with open('.tmp.lhe', 'r') as ftmp:
0256 sign = lambda x: -1 if x < 0 else 1
0257 for line in ftmp:
0258 event_line += 1
0259 if re.search('\s*<event.*>', line) and not re.search('\s*<event_num.*>', line):
0260 event_line = 0
0261 if event_line == 1:
0262
0263
0264 orig_wgt = float(line.split()[2])
0265 fw.write(re.sub(r'(^\s*\S+\s+\S+\s+)\S+(.+)', r'\g<1>%+.7E\g<2>' \
0266 % (sign(orig_wgt) * self._uwgt), line))
0267 elif re.search('\s*<wgt.*>.*</wgt>', line):
0268 addi_wgt_str = re.search(r'\<wgt.*\>\s*(\S+)\s*\<\/wgt\>', line).group(1)
0269 fw.write(line.replace(
0270 addi_wgt_str, '%+.7E' % (float(addi_wgt_str) / orig_wgt * self._uwgt)))
0271 else:
0272 fw.write(line)
0273 else:
0274
0275 with open('.tmp.lhe', 'r') as ftmp:
0276 for line in ftmp:
0277 fw.write(line)
0278 fw.write('</LesHouchesEvents>\n')
0279 os.remove('.tmp.lhe')
0280
0281
0282 class MG5LOLHEMerger(BaseLHEMerger):
0283 """Use the merger script dedicated for MG5 LO LHEs, as introduced in
0284 https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl
0285 """
0286
0287 def __init__(self, input_files, output_file, **kwargs):
0288 super(MG5LOLHEMerger, self).__init__(input_files, output_file)
0289 self._merger_script_url = \
0290 'https://raw.githubusercontent.com/cms-sw/genproductions/5c1e865a6fbe3a762a28363835d9a804c9cf0dbe/bin/MadGraph5_aMCatNLO/Utilities/merge.pl'
0291
0292 def merge(self):
0293 logging.info(
0294 ('Use the merger script in genproductions dedicated for '
0295 'MadGraph5-produced LHEs'))
0296 os.system('curl -s -L %s | perl - %s %s.gz banner.txt' \
0297 % (self._merger_script_url, ' '.join(self.input_files), self.output_file))
0298 os.system('gzip -df %s.gz' % self.output_file)
0299 os.system('rm banner.txt')
0300
0301
0302 class ExternalCppLHEMerger(BaseLHEMerger):
0303 """Use the external mergeLheFiles.cpp file to merge LHE files, as introduced in
0304 https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns
0305 """
0306
0307 def __init__(self, input_files, output_file, **kwargs):
0308 super(ExternalCppLHEMerger, self).__init__(input_files, output_file)
0309 self._merger_script_url = \
0310 'https://twiki.cern.ch/twiki/bin/viewfile/CMSPublic/SWGuideSubgroupMC?filename=mergeLheFiles.cpp;rev=2'
0311
0312 def merge(self):
0313 logging.info(
0314 ('Use the external mergeLheFiles.cpp file to merge LHE files.'))
0315 os.system('curl -s -o mergeLheFiles.cpp %s' % self._merger_script_url)
0316 with open('mergeLheFiles.cpp') as f:
0317 script_str = f.read()
0318 with open('mergeLheFiles.cpp', 'w') as fw:
0319 fw.write(script_str.replace('/tmp/covarell/out.lhe', self.output_file))
0320 with open('input_files.txt', 'w') as fw:
0321 fw.write('\n'.join(self.input_files) + '\n')
0322
0323 os.system('g++ -Wall -o mergeLheFiles mergeLheFiles.cpp')
0324 os.system('./mergeLheFiles input_files.txt')
0325 os.system('rm mergeLheFiles* input_files.txt')
0326
0327
0328 def main(argv = None):
0329 """Main routine of the script.
0330
0331 Arguments:
0332 - `argv`: arguments passed to the main routine
0333 """
0334
0335 if argv == None:
0336 argv = sys.argv[1:]
0337
0338 parser = argparse.ArgumentParser(
0339 description=("A universal script that merges multiple LHE files for all possible conditions and in the most "
0340 "natural way.\n"
0341 "A detailed description of the merging step (in the default mode):\n"
0342 " 1. Header:\n"
0343 " a. assert consistency of the headers (allow difference for the info of e.g. #event, seed);\n"
0344 " b. if not MG LO LHEs, will simply use the header from the first LHE; otherwise, reset the "
0345 "<MGGenerationInfo> from the headers by merging the #event & xsec info;\n"
0346 " 2. Init block: if all <init> blocks are the same, use the same as output; otherwise (the MG LO "
0347 "case), merge them by recalculating the # of subprocess (LRPUP) and XSECUP, XERRUP, XMAXUP per "
0348 "each subprocess.\n"
0349 " 3. Event block: concatenate all event blocks. If for MG LO LHEs, recalculate the per-event "
0350 "XWGTUP and all <wgt> tags based on the new XSECUP, #event, and 'event_norm' read from the MG "
0351 "run card.\n"
0352 "For further development of this script please always validate the merging result on the test "
0353 "routines: https://github.com/colizz/mergelhe_validate\n"
0354 "Example usage:\n"
0355 " mergeLHE.py -i 'thread*/*.lhe,another_file/another.lhe' -o output.lhe"),
0356 formatter_class=argparse.RawTextHelpFormatter)
0357 parser.add_argument("-i", "--input-files", type=str,
0358 help="Input LHE file paths separated by commas. Shell-type wildcards are supported.")
0359 parser.add_argument("-o", "--output-file",
0360 default='output.lhe', type=str,
0361 help="Output LHE file path.")
0362 parser.add_argument("--force-mglo-merger", action='store_true',
0363 help=("Force to use the merger script dedicated for MG5 LO LHEs, as introduced in "
0364 "https://github.com/cms-sw/genproductions/blob/master/bin/MadGraph5_aMCatNLO/Utilities/merge.pl"))
0365 parser.add_argument("--force-cpp-merger", action='store_true',
0366 help=("Force to use the external mergeLheFiles.cpp file to merge LHE files, as introduced in "
0367 "https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWGuideSubgroupMC#1_2_Using_pLHE_campaigns"))
0368 parser.add_argument("-b", "--bypass-check", action='store_true',
0369 help=("Bypass the compatibility check for the headers. If true, the header and init block "
0370 "will be just a duplicate from the first input file, and events are concatenated without "
0371 "modification."))
0372 parser.add_argument("-n", "--number-events", action='store_true',
0373 help=("Add a tag to number each lhe event. Needed for Herwig to find correct lhe events"))
0374 parser.add_argument("--debug", action='store_true',
0375 help="Use the debug mode.")
0376 args = parser.parse_args(argv)
0377
0378 logging.basicConfig(
0379 format='[%(levelname)s] %(message)s',
0380 level=logging.INFO if not args.debug else DEBUG)
0381 logging.info('>>> launch mergeLHE.py in %s' % os.path.abspath(os.getcwd()))
0382
0383
0384 assert len(args.input_files), \
0385 ('Please specify your input LHE files by -i/--input-files. '
0386 'Run \'mergeLHE.py -h\' for details.')
0387 input_files = []
0388 for path in args.input_files.split(','):
0389 find_files = glob.glob(path)
0390 if len(find_files) == 0:
0391 logging.info('Warning: cannot find files in %s' % path)
0392 input_files += find_files
0393 input_files.sort()
0394 logging.info('>>> Merge %d files: [%s]' % (len(input_files), ', '.join(input_files)))
0395 logging.info('>>> Write to output: %s ' % args.output_file)
0396
0397 if not os.path.exists(os.path.dirname(os.path.realpath(args.output_file))):
0398 os.makedirs(os.path.dirname(os.path.realpath(args.output_file)))
0399
0400 if args.number_events:
0401 offset = 0
0402 for input_file in input_files:
0403 offset += addLHEnumbers.number_events(input_file, offset=offset)
0404
0405
0406 assert len(input_files) > 0, 'Input LHE files should be more than 0.'
0407 if len(input_files) == 1:
0408 logging.warning('Input LHE only has 1 file. Will copy this file to the destination.')
0409 import shutil
0410 shutil.copy(input_files[0], args.output_file)
0411 return
0412 assert [args.force_mglo_merger, args.force_cpp_merger].count(True) <= 1, \
0413 "Can only specify at most one from --force-mglo-merger or --force-cpp-merger."
0414
0415
0416 if args.force_mglo_merger:
0417 lhe_merger = MG5LOLHEMerger(input_files, args.output_file)
0418 elif args.force_cpp_merger:
0419 lhe_merger = ExternalCppLHEMerger(input_files, args.output_file)
0420 else:
0421 lhe_merger = DefaultLHEMerger(
0422 input_files, args.output_file, bypass_check=args.bypass_check)
0423
0424
0425 lhe_merger.merge()
0426
0427
0428 if __name__=="__main__":
0429 main()