Math/interface/libminifloat.h

0001 #ifndef libminifloat_h
0002 #define libminifloat_h
0003 #include "FWCore/Utilities/interface/thread_safety_macros.h"
0004 #include "FWCore/Utilities/interface/bit_cast.h"
0005 #include <cstdint>
0006 #include <cassert>
0007 #include <algorithm>
0008
0009 // ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
0010 class MiniFloatConverter {
0011 public:
0012   MiniFloatConverter();
0013   inline static float float16to32(uint16_t h) {
0014     uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
0015     return edm::bit_cast<float>(i32);
0016   }
0017   inline static uint16_t float32to16(float x) { return float32to16round(x); }
0018   /// Fast implementation, but it crops the number so it biases low
0019   inline static uint16_t float32to16crop(float x) {
0020     uint32_t i32 = edm::bit_cast<uint32_t>(x);
0021     return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
0022   }
0023   /// Slower implementation, but it rounds to avoid biases
0024   inline static uint16_t float32to16round(float x) {
0025     uint32_t i32 = edm::bit_cast<uint32_t>(x);
0026     uint8_t shift = shifttable[(i32 >> 23) & 0x1ff];
0027     if (shift == 13) {
0028       uint16_t base2 = (i32 & 0x007fffff) >> 12;
0029       uint16_t base = base2 >> 1;
0030       if (((base2 & 1) != 0) && (base < 1023))
0031         base++;
0032       return basetable[(i32 >> 23) & 0x1ff] + base;
0033     } else {
0034       return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
0035     }
0036   }
0037   template <int bits>
0038   inline static float reduceMantissaToNbits(const float &f) {
0039     static_assert(bits <= 23, "max mantissa size is 23 bits");
0040     constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
0041     uint32_t i32 = edm::bit_cast<uint32_t>(f);
0042     i32 &= mask;
0043     return edm::bit_cast<float>(i32);
0044   }
0045   inline static float reduceMantissaToNbits(const float &f, int bits) {
0046     uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
0047     uint32_t i32 = edm::bit_cast<uint32_t>(f);
0048     i32 &= mask;
0049     return edm::bit_cast<float>(i32);
0050   }
0051
0052   class ReduceMantissaToNbitsRounding {
0053   public:
0054     ReduceMantissaToNbitsRounding(int bits)
0055         : shift(23 - bits), mask((0xFFFFFFFF >> (shift)) << (shift)), test(1 << (shift - 1)), maxn((1 << bits) - 2) {
0056       assert(bits <= 23);  // "max mantissa size is 23 bits"
0057     }
0058     float operator()(float f) const {
0059       constexpr uint32_t low23 = (0x007FFFFF);  // mask to keep lowest 23 bits = mantissa
0060       constexpr uint32_t hi9 = (0xFF800000);    // mask to keep highest 9 bits = the rest
0061       uint32_t i32 = edm::bit_cast<uint32_t>(f);
0062       if (i32 & test) {  // need to round
0063         uint32_t mantissa = (i32 & low23) >> shift;
0064         if (mantissa < maxn)
0065           mantissa++;
0066         i32 = (i32 & hi9) | (mantissa << shift);
0067       } else {
0068         i32 &= mask;
0069       }
0070       return edm::bit_cast<float>(i32);
0071     }
0072
0073   private:
0074     const int shift;
0075     const uint32_t mask, test, maxn;
0076   };
0077
0078   template <int bits>
0079   inline static float reduceMantissaToNbitsRounding(const float &f) {
0080     static const ReduceMantissaToNbitsRounding reducer(bits);
0081     return reducer(f);
0082   }
0083
0084   inline static float reduceMantissaToNbitsRounding(float f, int bits) {
0085     return ReduceMantissaToNbitsRounding(bits)(f);
0086   }
0087
0088   template <typename InItr, typename OutItr>
0089   static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out) {
0090     std::transform(begin, end, out, ReduceMantissaToNbitsRounding(bits));
0091   }
0092
0093   inline static float max() {
0094     constexpr uint32_t i32 = 0x477fe000;  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
0095     return edm::bit_cast<float>(i32);
0096   }
0097
0098   // Maximum float32 value that gets rounded to max()
0099   inline static float max32RoundedToMax16() {
0100     // 2^16 in float32 is the first to result inf in float16, so
0101     // 2^16-1 is the last float32 to result max() in float16
0102     constexpr uint32_t i32 = (0x8f << 23) - 1;
0103     return edm::bit_cast<float>(i32);
0104   }
0105
0106   inline static float min() {
0107     constexpr uint32_t i32 = 0x38800000;  // = mantissatable[offsettable[1]+0]+exponenttable[1]
0108     return edm::bit_cast<float>(i32);
0109   }
0110
0111   // Minimum float32 value that gets rounded to min()
0112   inline static float min32RoundedToMin16() {
0113     // 2^-14-1 in float32 is the first to result denormalized in float16, so
0114     // 2^-14 is the first float32 to result min() in float16
0115     constexpr uint32_t i32 = (0x71 << 23);
0116     return edm::bit_cast<float>(i32);
0117   }
0118
0119   inline static float denorm_min() {
0120     constexpr uint32_t i32 = 0x33800000;  // mantissatable[offsettable[0]+1]+exponenttable[0]
0121     return edm::bit_cast<float>(i32);
0122   }
0123
0124   inline static bool isdenorm(uint16_t h) {
0125     // if exponent is zero (sign-bit excluded of course) and mantissa is not zero
0126     return ((h >> 10) & 0x1f) == 0 && (h & 0x3ff) != 0;
0127   }
0128
0129 private:
0130   CMS_THREAD_SAFE static uint32_t mantissatable[2048];
0131   CMS_THREAD_SAFE static uint32_t exponenttable[64];
0132   CMS_THREAD_SAFE static uint16_t offsettable[64];
0133   CMS_THREAD_SAFE static uint16_t basetable[512];
0134   CMS_THREAD_SAFE static uint8_t shifttable[512];
0135   static void filltables();
0136 };
0137 #endif