Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-10-30 00:11:28

0001 #ifndef libminifloat_h
0002 #define libminifloat_h
0003 #include "FWCore/Utilities/interface/thread_safety_macros.h"
0004 #include "FWCore/Utilities/interface/bit_cast.h"
0005 #include <cstdint>
0006 #include <cassert>
0007 #include <algorithm>
0008 
0009 // ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
0010 class MiniFloatConverter {
0011 public:
0012   MiniFloatConverter();
0013   inline static float float16to32(uint16_t h) {
0014     uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
0015     return edm::bit_cast<float>(i32);
0016   }
0017   inline static uint16_t float32to16(float x) { return float32to16round(x); }
0018   /// Fast implementation, but it crops the number so it biases low
0019   inline static uint16_t float32to16crop(float x) {
0020     uint32_t i32 = edm::bit_cast<uint32_t>(x);
0021     return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
0022   }
0023   /// Slower implementation, but it rounds to avoid biases
0024   inline static uint16_t float32to16round(float x) {
0025     uint32_t i32 = edm::bit_cast<uint32_t>(x);
0026     uint8_t shift = shifttable[(i32 >> 23) & 0x1ff];
0027     if (shift == 13) {
0028       uint16_t base2 = (i32 & 0x007fffff) >> 12;
0029       uint16_t base = base2 >> 1;
0030       if (((base2 & 1) != 0) && (base < 1023))
0031         base++;
0032       return basetable[(i32 >> 23) & 0x1ff] + base;
0033     } else {
0034       return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
0035     }
0036   }
0037   template <int bits>
0038   inline static float reduceMantissaToNbits(const float &f) {
0039     static_assert(bits <= 23, "max mantissa size is 23 bits");
0040     constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
0041     uint32_t i32 = edm::bit_cast<uint32_t>(f);
0042     i32 &= mask;
0043     return edm::bit_cast<float>(i32);
0044   }
0045   inline static float reduceMantissaToNbits(const float &f, int bits) {
0046     uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
0047     uint32_t i32 = edm::bit_cast<uint32_t>(f);
0048     i32 &= mask;
0049     return edm::bit_cast<float>(i32);
0050   }
0051 
0052   class ReduceMantissaToNbitsRounding {
0053   public:
0054 #ifdef CMS_UNDEFINED_SANITIZER
0055     //Supress UBSan runtime error about -ve shift. This happens when bits==23
0056     __attribute__((no_sanitize("shift")))
0057 #endif
0058     ReduceMantissaToNbitsRounding(int bits)
0059         : shift(23 - bits), mask((0xFFFFFFFF >> (shift)) << (shift)), test(1 << (shift - 1)), maxn((1 << bits) - 2) {
0060       assert(bits <= 23);  // "max mantissa size is 23 bits"
0061     }
0062     float operator()(float f) const {
0063       constexpr uint32_t low23 = (0x007FFFFF);  // mask to keep lowest 23 bits = mantissa
0064       constexpr uint32_t hi9 = (0xFF800000);    // mask to keep highest 9 bits = the rest
0065       uint32_t i32 = edm::bit_cast<uint32_t>(f);
0066       if (i32 & test) {  // need to round
0067         uint32_t mantissa = (i32 & low23) >> shift;
0068         if (mantissa < maxn)
0069           mantissa++;
0070         i32 = (i32 & hi9) | (mantissa << shift);
0071       } else {
0072         i32 &= mask;
0073       }
0074       return edm::bit_cast<float>(i32);
0075     }
0076 
0077   private:
0078     const int shift;
0079     const uint32_t mask, test, maxn;
0080   };
0081 
0082   template <int bits>
0083   inline static float reduceMantissaToNbitsRounding(const float &f) {
0084     static const ReduceMantissaToNbitsRounding reducer(bits);
0085     return reducer(f);
0086   }
0087 
0088   inline static float reduceMantissaToNbitsRounding(float f, int bits) {
0089     return ReduceMantissaToNbitsRounding(bits)(f);
0090   }
0091 
0092   template <typename InItr, typename OutItr>
0093   static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out) {
0094     std::transform(begin, end, out, ReduceMantissaToNbitsRounding(bits));
0095   }
0096 
0097   inline static float max() {
0098     constexpr uint32_t i32 = 0x477fe000;  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
0099     return edm::bit_cast<float>(i32);
0100   }
0101 
0102   // Maximum float32 value that gets rounded to max()
0103   inline static float max32RoundedToMax16() {
0104     // 2^16 in float32 is the first to result inf in float16, so
0105     // 2^16-1 is the last float32 to result max() in float16
0106     constexpr uint32_t i32 = (0x8f << 23) - 1;
0107     return edm::bit_cast<float>(i32);
0108   }
0109 
0110   inline static float min() {
0111     constexpr uint32_t i32 = 0x38800000;  // = mantissatable[offsettable[1]+0]+exponenttable[1]
0112     return edm::bit_cast<float>(i32);
0113   }
0114 
0115   // Minimum float32 value that gets rounded to min()
0116   inline static float min32RoundedToMin16() {
0117     // 2^-14-1 in float32 is the first to result denormalized in float16, so
0118     // 2^-14 is the first float32 to result min() in float16
0119     constexpr uint32_t i32 = (0x71 << 23);
0120     return edm::bit_cast<float>(i32);
0121   }
0122 
0123   inline static float denorm_min() {
0124     constexpr uint32_t i32 = 0x33800000;  // mantissatable[offsettable[0]+1]+exponenttable[0]
0125     return edm::bit_cast<float>(i32);
0126   }
0127 
0128   inline static bool isdenorm(uint16_t h) {
0129     // if exponent is zero (sign-bit excluded of course) and mantissa is not zero
0130     return ((h >> 10) & 0x1f) == 0 && (h & 0x3ff) != 0;
0131   }
0132 
0133 private:
0134   CMS_THREAD_SAFE static uint32_t mantissatable[2048];
0135   CMS_THREAD_SAFE static uint32_t exponenttable[64];
0136   CMS_THREAD_SAFE static uint16_t offsettable[64];
0137   CMS_THREAD_SAFE static uint16_t basetable[512];
0138   CMS_THREAD_SAFE static uint8_t shifttable[512];
0139   static void filltables();
0140 };
0141 #endif