import h5py
import zlib
import lzma
import numpy as np

# The file structure
#
0008 # "format_version" - Attribute says which version of the file format was used
0009 # "default_payload_compressor" - Attribute name of compressor used for the payloads
0010 #
0011 # "Records"- Group
0012 #   <Record> - Group name is the EventSetup record name
0013 #      "DataProducts" - Group
0014 #         <data product> - Group name is the '<type>@<label>' combination
0015 #            "type" - Attribute, the C++ canonical type name
0016 #            "Payloads" - Group
0017 #               <payload> - DataSet name is hash used in DB
0018 #                  "memsize" = Attribute bytes needed after decompression
0019 #                  "type" = Attribute the actual type stored (for polymorphism)
0020 #      "Tags" - Group
0021 #        <tag> - Group name is
0022 #                   same as DB if only one data product is in the tag
0023 #                   a hybrid name formed from the different DB tags it merged
0024 #          "products" - Attribute, list of the data products used in the order they appear in "payload"
0025 #          "time_type" - Attribute, either 'run_lumi' or 'time'
0026 #          "db_tags" - Attribute the list of DB tags that were combine
0027 #          "record" - Attribute name of the record to which the tag is associated (optimizes readback)
0028 #          "first" - DataSet holds the beginning IOVSyncValue for the IOVs
0029 #          "last" - DataSet holds the end IOVSyncValue for the IOVS
0030 #          "payload" - DataSet references to the payloads for this IOV for each data product
0031 #
0032 # "GlobalTags" - Group
0033 #   <global tag> - Group name is the global tag name
0034 #      "Tags" - DataSet holds references to the tags
0035 
0036 
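# Illustrative sketch, not part of the converter: it shows how the layout documented
# above can be walked back with h5py. The function name and its printout are
# hypothetical; the group, dataset and attribute names come from the description above.
def _dumpTagIOVs(fileName, recordName, tagName):
    """Print the IOV boundaries and the payloads used by one tag."""
    with h5py.File(fileName, "r") as h5file:
        tagGroup = h5file["Records"][recordName]["Tags"][tagName]
        products = [x.decode("ascii") for x in tagGroup.attrs["products"]]
        first = tagGroup["first"][()]          # structured array of (high, low) sync values
        last = tagGroup["last"][()]
        payloadRefs = tagGroup["payload"][()]  # one row of payload references per IOV
        for i in range(len(first)):
            print("IOV %i: (%i,%i) -> (%i,%i)" % (i, first[i]["high"], first[i]["low"],
                                                  last[i]["high"], last[i]["low"]))
            for productName, ref in zip(products, payloadRefs[i]):
                payloadDataSet = h5file[ref]
                print("  %s -> %s (%i bytes uncompressed)"
                      % (productName, payloadDataSet.name,
                         payloadDataSet.attrs.get("memsize", 0)))

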
def writeTagImpl(tagsGroup, name, recName, time_type, IOV_payloads, payloadToRefs, productNames, originalTagNames):
    # each entry of IOV_payloads is (firstSyncValue, lastSyncValue, payloadNames) where the sync
    # values have 'high' and 'low' parts and payloadNames holds one name per data product
    tagGroup = tagsGroup.create_group(name)
    tagGroup.attrs["time_type"] = time_type.encode("ascii")
    tagGroup.attrs["db_tags"] = [x.encode("ascii") for x in originalTagNames]
    tagGroup.attrs["record"] = recName.encode("ascii")
    tagGroup.attrs["products"] = [x.encode("ascii") for x in productNames]
    firstValues = [x[0] for x in IOV_payloads]
    lastValues = [x[1] for x in IOV_payloads]
    syncValueType = np.dtype([("high", np.uint32), ("low", np.uint32)])
    first_np = np.empty(shape=(len(IOV_payloads),), dtype=syncValueType)
    first_np['high'] = [x.high for x in firstValues]
    first_np['low'] = [x.low for x in firstValues]
    last_np = np.empty(shape=(len(lastValues),), dtype=syncValueType)
    last_np['high'] = [x.high for x in lastValues]
    last_np['low'] = [x.low for x in lastValues]
    payloads = [[payloadToRefs[y] for y in x[2]] for x in IOV_payloads]
    # only gzip-compress the IOV datasets once the tag has a sizable number of IOVs
    compressor = None
    if len(first_np) > 100:
        compressor = 'gzip'
    tagGroup.create_dataset("first", data=first_np, compression=compressor)
    tagGroup.create_dataset("last", data=last_np, compression=compressor)
    tagGroup.create_dataset("payload", data=payloads, dtype=h5py.ref_dtype, compression=compressor)
    return tagGroup.ref


def writeTag(tagsGroup, time_type, IOV_payloads, payloadToRefs, originalTagNames, recName, productNames):
    # a tag built by joining several DB tags gets the first tag's name plus an '@joined' suffix
    name = originalTagNames[0]
    if len(originalTagNames) != 1:
        name = name + "@joined"
    return writeTagImpl(tagsGroup, name, recName, time_type, IOV_payloads, payloadToRefs, productNames, originalTagNames)

def writeH5File(fileName, globalTags, excludeRecords, includeRecords, tagReader, compressorName):
    # key list records are skipped below: they seem to hold objects of type
    # 'cond::persistency::KeyList' and have their own proxy type
    keyListRecords = set(["ExDwarfListRcd", "DTKeyedConfigListRcd", "DTKeyedConfigContainerRcd"])

    default_compressor_name = compressorName
    print(default_compressor_name)
    default_compressor = None
    if default_compressor_name == "zlib":
        default_compressor = zlib
    elif default_compressor_name == "lzma":
        default_compressor = lzma
    with h5py.File(fileName, 'w') as h5file:
        h5file.attrs["file_format"] = 1
        h5file.attrs["default_payload_compressor"] = default_compressor_name.encode("ascii")
        recordsGroup = h5file.create_group("Records")
        globalTagsGroup = h5file.create_group("GlobalTags")
        null_dataset = h5file.create_dataset("null_payload", data=np.array([], dtype='b'))
        tagGroupRefs = []

        for name in globalTags:
            gt = tagReader(name)
            for tag in gt.tags():
                rcd = tag.record()
                if rcd in keyListRecords:
                    continue
                if rcd in excludeRecords:
                    continue
                if includeRecords and rcd not in includeRecords:
                    continue
                recordDataSize = 0

                payloadToRefs = {None: null_dataset.ref}

                recordGroup = recordsGroup.create_group(rcd)
                tagsGroup = recordGroup.create_group("Tags")
                dataProductsGroup = recordGroup.create_group("DataProducts")
                print("record: %s" % rcd)
                productNames = []
                for dataProduct in tag.dataProducts():
                    productNames.append(dataProduct.name())
                    dataProductGroup = dataProductsGroup.create_group(dataProduct.name())
                    dataProductGroup.attrs["type"] = dataProduct.objtype().encode("ascii")
                    payloadsGroup = dataProductGroup.create_group("Payloads")
                    print(" product: %s" % dataProduct.name())
                    for p_index, payload in enumerate(dataProduct.payloads()):
                        print("  %i payload: %s size: %i" % (p_index, payload.name(), len(payload.data())))
                        recordDataSize += len(payload.data())
                        if default_compressor:
                            b = default_compressor.compress(payload.data())
                            if len(b) >= len(payload.data()):
                                # compressing isn't helping; store the raw bytes instead
                                # (see the read-back sketch at the end of this file)
                                b = payload.data()
                        else:
                            b = payload.data()
                        pl = payloadsGroup.create_dataset(payload.name(), data=np.frombuffer(b, dtype='b'))
                        pl.attrs["memsize"] = len(payload.data())
                        pl.attrs["type"] = payload.actualType()
                        payloadToRefs[payload.name()] = pl.ref

                tagGroupRefs.append(writeTag(tagsGroup, tag.time_type(), tag.iovsNPayloadNames(), payloadToRefs, tag.originalTagNames(), rcd, productNames))
                print(" total size:", recordDataSize)
                recordDataSize = 0

            globalTagGroup = globalTagsGroup.create_group(name)
            globalTagGroup.create_dataset("Tags", data=tagGroupRefs, dtype=h5py.ref_dtype)
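

# Illustrative read-back sketch, not part of the converter above. It undoes the payload
# compression done in writeH5File: because the writer falls back to storing the raw bytes
# whenever compression does not shrink them, a reader can tell the two cases apart by
# comparing the stored size with the "memsize" attribute. The function name and argument
# names are hypothetical.
def _readPayloadBytes(h5file, payloadRef):
    """Return the uncompressed bytes of the payload dataset referenced by payloadRef."""
    compressorName = h5file.attrs["default_payload_compressor"]
    if isinstance(compressorName, bytes):
        compressorName = compressorName.decode("ascii")
    dataset = h5file[payloadRef]
    stored = dataset[()].tobytes()
    memsize = dataset.attrs.get("memsize", len(stored))
    if len(stored) == memsize:
        # stored uncompressed (no compressor configured, or compression did not help)
        return stored
    if compressorName == "zlib":
        return zlib.decompress(stored)
    if compressorName == "lzma":
        return lzma.decompress(stored)
    return stored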