1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
#ifndef libminifloat_h
#define libminifloat_h
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "FWCore/Utilities/interface/bit_cast.h"
#include <cstdint>
#include <cassert>
#include <algorithm>
// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
/// Helpers for converting between 32-bit floats and a 16-bit "minifloat"
/// bit pattern (1 sign bit, 5 exponent bits, 10 mantissa bits), plus utilities
/// for reducing the mantissa precision of a 32-bit float.
///
/// The 16<->32 conversions are pure table lookups. The tables are static
/// members defined outside this header; presumably they are populated by
/// filltables() -- NOTE(review): confirm where filltables() is invoked.
class MiniFloatConverter {
public:
  /// Defined out of line (not visible in this header).
  MiniFloatConverter();

  /// Expand a 16-bit minifloat bit pattern into a 32-bit float.
  /// The upper six bits of `h` (sign + exponent) select the exponent and
  /// mantissa-offset table entries; the lower ten bits select the mantissa.
  inline static float float16to32(uint16_t h) {
    uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
    return edm::bit_cast<float>(i32);
  }

  /// Default 32 -> 16 bit conversion; forwards to the rounding implementation.
  inline static uint16_t float32to16(float x) { return float32to16round(x); }

  /// Fast implementation, but it crops the number so it biases low
  inline static uint16_t float32to16crop(float x) {
    uint32_t i32 = edm::bit_cast<uint32_t>(x);
    // Tables are indexed by the nine top bits of the float (sign + exponent).
    return basetable[(i32 >> 23) & 0x1ff] + ((i32 & 0x007fffff) >> shifttable[(i32 >> 23) & 0x1ff]);
  }

  /// Slower implementation, but it rounds to avoid biases
  inline static uint16_t float32to16round(float x) {
    const uint32_t i32 = edm::bit_cast<uint32_t>(x);
    const uint32_t index = (i32 >> 23) & 0x1ff;  // sign + exponent bits
    const uint8_t shift = shifttable[index];
    if (shift == 13) {
      // A shift of 13 keeps the top 10 of the 23 mantissa bits. Keep one extra
      // bit (shift by 12) and use it to decide whether to round up.
      uint16_t base2 = (i32 & 0x007fffff) >> 12;
      uint16_t base = base2 >> 1;
      // Do not round up an all-ones mantissa (1023): the increment would
      // overflow the 10-bit mantissa field.
      if (((base2 & 1) != 0) && (base < 1023))
        base++;
      return basetable[index] + base;
    } else {
      // Any other shift value: same result as float32to16crop().
      return basetable[index] + ((i32 & 0x007fffff) >> shift);
    }
  }

  /// Truncate the mantissa of `f` to its `bits` most significant bits by
  /// zeroing the lowest (23 - bits) bits. Sign and exponent are untouched.
  template <int bits>
  inline static float reduceMantissaToNbits(const float &f) {
    static_assert(bits <= 23, "max mantissa size is 23 bits");
    constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
    uint32_t i32 = edm::bit_cast<uint32_t>(f);
    i32 &= mask;
    return edm::bit_cast<float>(i32);
  }

  /// Run-time variant of reduceMantissaToNbits<bits>().
  /// `bits` must not exceed 23, otherwise the shift count would be negative.
  inline static float reduceMantissaToNbits(const float &f, int bits) {
    assert(bits <= 23);  // "max mantissa size is 23 bits"
    uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
    uint32_t i32 = edm::bit_cast<uint32_t>(f);
    i32 &= mask;
    return edm::bit_cast<float>(i32);
  }

  /// Function object that reduces the mantissa of a float to `bits` bits,
  /// rounding up when the most significant dropped bit is set.
  class ReduceMantissaToNbitsRounding {
  public:
    /// @param bits number of mantissa bits to keep; must not exceed 23.
    ///
    /// With bits == 23 nothing is dropped (shift == 0), so there is no
    /// rounding bit: `test` is set to 0 rather than evaluating `1 << -1`,
    /// which would be undefined behaviour. operator() then returns its
    /// argument unchanged.
    ReduceMantissaToNbitsRounding(int bits)
        : shift(23 - bits),
          mask((0xFFFFFFFF >> (shift)) << (shift)),
          test(shift > 0 ? (1u << (shift - 1)) : 0u),
          maxn((1 << bits) - 2) {
      assert(bits <= 23);  // "max mantissa size is 23 bits"
    }

    /// Return `f` with its mantissa reduced to the configured number of bits.
    float operator()(float f) const {
      constexpr uint32_t low23 = (0x007FFFFF);  // mask to keep lowest 23 bits = mantissa
      constexpr uint32_t hi9 = (0xFF800000);    // mask to keep highest 9 bits = the rest
      uint32_t i32 = edm::bit_cast<uint32_t>(f);
      if (i32 & test) {  // need to round
        uint32_t mantissa = (i32 & low23) >> shift;
        // Only increment while below maxn, so the kept mantissa can never
        // overflow into the exponent; otherwise the value is just truncated.
        if (mantissa < maxn)
          mantissa++;
        i32 = (i32 & hi9) | (mantissa << shift);
      } else {
        i32 &= mask;
      }
      return edm::bit_cast<float>(i32);
    }

  private:
    const int shift;      // number of low mantissa bits dropped (23 - bits)
    const uint32_t mask;  // clears the dropped bits
    const uint32_t test;  // most significant dropped bit, or 0 if none is dropped
    const uint32_t maxn;  // kept mantissa values >= maxn are not rounded up
  };

  /// Compile-time-configured rounding reduction; the functor is built once
  /// and reused on subsequent calls.
  template <int bits>
  inline static float reduceMantissaToNbitsRounding(const float &f) {
    static const ReduceMantissaToNbitsRounding reducer(bits);
    return reducer(f);
  }

  /// Run-time-configured rounding reduction of a single value.
  inline static float reduceMantissaToNbitsRounding(float f, int bits) {
    return ReduceMantissaToNbitsRounding(bits)(f);
  }

  /// Apply the rounding reduction to every element of [begin, end), writing
  /// the results through `out`.
  template <typename InItr, typename OutItr>
  static void reduceMantissaToNbitsRounding(int bits, InItr begin, InItr end, OutItr out) {
    std::transform(begin, end, out, ReduceMantissaToNbitsRounding(bits));
  }

  /// Largest finite value representable in float16 (65504).
  inline static float max() {
    constexpr uint32_t i32 = 0x477fe000;  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
    return edm::bit_cast<float>(i32);
  }

  // Maximum float32 value that gets rounded to max()
  inline static float max32RoundedToMax16() {
    // 2^16 in float32 is the first to result in inf in float16, so the
    // largest float32 strictly below 2^16 is the last one to give max().
    constexpr uint32_t i32 = (0x8f << 23) - 1;
    return edm::bit_cast<float>(i32);
  }

  /// Smallest positive normalized value representable in float16 (2^-14).
  inline static float min() {
    constexpr uint32_t i32 = 0x38800000;  // = mantissatable[offsettable[1]+0]+exponenttable[1]
    return edm::bit_cast<float>(i32);
  }

  // Minimum float32 value that gets rounded to min()
  inline static float min32RoundedToMin16() {
    // 2^-14 is the first float32 to result in min() in float16; per the
    // original note, anything smaller results in a denormalized float16.
    constexpr uint32_t i32 = (0x71 << 23);
    return edm::bit_cast<float>(i32);
  }

  /// Smallest positive denormalized value representable in float16 (2^-24).
  inline static float denorm_min() {
    constexpr uint32_t i32 = 0x33800000;  // mantissatable[offsettable[0]+1]+exponenttable[0]
    return edm::bit_cast<float>(i32);
  }

  /// True if the float16 bit pattern `h` encodes a denormalized number.
  inline static bool isdenorm(uint16_t h) {
    // if exponent is zero (sign-bit excluded of course) and mantissa is not zero
    return ((h >> 10) & 0x1f) == 0 && (h & 0x3ff) != 0;
  }

private:
  // Lookup tables used by the conversions above; defined outside this header.
  CMS_THREAD_SAFE static uint32_t mantissatable[2048];
  CMS_THREAD_SAFE static uint32_t exponenttable[64];
  CMS_THREAD_SAFE static uint16_t offsettable[64];
  CMS_THREAD_SAFE static uint16_t basetable[512];
  CMS_THREAD_SAFE static uint8_t shifttable[512];
  static void filltables();
};
#endif
|