Math/test/testMiniFloat.cpp

#include <cppunit/extensions/HelperMacros.h>
#include <cstdint>
#include <cstring>
#include <iostream>

#include "DataFormats/Math/interface/libminifloat.h"
#include "FWCore/Utilities/interface/isFinite.h"
0006
0007 class testMiniFloat : public CppUnit::TestFixture {
0008   CPPUNIT_TEST_SUITE(testMiniFloat);
0009
0010   CPPUNIT_TEST(testIsDenorm);
0011   CPPUNIT_TEST(testMax);
0012   CPPUNIT_TEST(testMax32RoundedToMax16);
0013   CPPUNIT_TEST(testMin);
0014   CPPUNIT_TEST(testMin32RoundedToMin16);
0015   CPPUNIT_TEST(testDenormMin);
0016
0017   CPPUNIT_TEST_SUITE_END();
0018
0019 public:
0020   void setUp() {}
0021   void tearDown() {}
0022
0023   void testIsDenorm();
0024   void testMax();
0025   void testMax32RoundedToMax16();
0026   void testMin();
0027   void testMin32RoundedToMin16();
0028   void testDenormMin();
0029
0030 private:
0031 };
0032
0033 CPPUNIT_TEST_SUITE_REGISTRATION(testMiniFloat);
0034
0035 void testMiniFloat::testIsDenorm() {
0036   // all float16s with zero exponent and non-zero mantissa are denormals, test here the boundaries
0037   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(1));
0038   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(1 | (1 << 15)));  // negative 1
0039   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(0x3ff));
0040   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(0x3ff) | (1 << 15));  // negative full-1 mantissa
0041
0042   // Test also boundary cases for float16 not being denormal
0043   CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0));
0044   CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0x400));              // exponent 1, zero mantissa
0045   CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0x400 | (1 << 15)));  // negative exponent 1, zero mantissa
0046 }
0047
0048 void testMiniFloat::testMax() {
0049   // 0x1f exponent is for inf, so 0x1e is the maximum
0050   // in maximum mantissa all bits are 1
0051   const uint16_t minifloatmax = (0x1e << 10) | 0x3ff;
0052   CPPUNIT_ASSERT(MiniFloatConverter::max() == MiniFloatConverter::float16to32(minifloatmax));
0053
0054   // adding 1ulp(16) to max should give inf
0055   const uint16_t minifloatinf = minifloatmax + 1;
0056   CPPUNIT_ASSERT(edm::isNotFinite(MiniFloatConverter::float16to32(minifloatinf)));
0057 }
0058
0059 void testMiniFloat::testMax32RoundedToMax16() {
0060   // max32RoundedToMax16() -> float16 -> float32 should give max()
0061   CPPUNIT_ASSERT(MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(
0062                      MiniFloatConverter::max32RoundedToMax16())) == MiniFloatConverter::max());
0063
0064   // max32RoundedToMax16() + 1ulp(32) should give inf(16)
0065   union {
0066     float flt;
0067     uint32_t i32;
0068   } conv;
0069   conv.flt = MiniFloatConverter::max32RoundedToMax16();
0070   conv.i32 += 1;
0071   const float max32PlusUlp32RoundedTo16 = MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(conv.flt));
0072   CPPUNIT_ASSERT(edm::isNotFinite(max32PlusUlp32RoundedTo16));
0073 }
0074
0075 void testMiniFloat::testMin() {
0076   // 1 exponent, and 0 mantissa gives the smallest non-denormalized number of float16
0077   CPPUNIT_ASSERT(MiniFloatConverter::min() == MiniFloatConverter::float16to32(1 << 10));
0078
0079   // subtracting 1ulp(16) from min should give denormalized float16
0080   const uint16_t minifloat_denorm = MiniFloatConverter::float32to16(MiniFloatConverter::min()) - 1;
0081   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(minifloat_denorm));
0082
0083   // subtracking 1ulp(32) from min should also give denormalized float16 (both crop and round)
0084   union {
0085     float flt;
0086     uint32_t i32;
0087   } conv;
0088   conv.flt = MiniFloatConverter::min();
0089   conv.i32 -= 1;
0090   const uint16_t min32MinusUlp32CroppedTo16 = MiniFloatConverter::float32to16crop(conv.flt);
0091   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32CroppedTo16));
0092   const uint16_t min32MinusUlp32RoundedTo16 = MiniFloatConverter::float32to16round(conv.flt);
0093   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32RoundedTo16));
0094 }
0095
0096 void testMiniFloat::testMin32RoundedToMin16() {
0097   // min32RoundedToMin16() -> float16 -> float32 should be the same as min()
0098   CPPUNIT_ASSERT(MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(
0099                      MiniFloatConverter::min32RoundedToMin16())) == MiniFloatConverter::min());
0100
0101   // min32RoundedToMax16() - 1ulp(32) should give denormalized float16
0102   union {
0103     float flt;
0104     uint32_t i32;
0105   } conv;
0106   conv.flt = MiniFloatConverter::min32RoundedToMin16();
0107   conv.i32 -= 1;
0108   const uint16_t min32MinusUlp32RoundedTo16 = MiniFloatConverter::float32to16(conv.flt);
0109   CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32RoundedTo16));
0110 }
0111
0112 void testMiniFloat::testDenormMin() {
0113   // zero exponent, and 0x1 in mantissa gives the smallest number of float16
0114   CPPUNIT_ASSERT(MiniFloatConverter::denorm_min() == MiniFloatConverter::float16to32(1));
0115
0116   // subtracting 1ulp(16) from denorm_min should give 0 float32
0117   CPPUNIT_ASSERT(
0118       MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(MiniFloatConverter::denorm_min()) - 1) == 0.f);
0119
0120   // subtracking 1ulp(32) from denorm_min should also give 0 float32
0121   union {
0122     float flt;
0123     uint32_t i32;
0124   } conv;
0125   conv.flt = MiniFloatConverter::denorm_min();
0126   conv.i32 -= 1;
0127   const float min32MinusUlp32RoundedTo16 =
0128       MiniFloatConverter::float16to32(MiniFloatConverter::float32to16round(conv.flt));
0129   CPPUNIT_ASSERT(min32MinusUlp32RoundedTo16 == 0.f);
0130   const float min32MinusUlp32CroppedTo16 =
0131       MiniFloatConverter::float16to32(MiniFloatConverter::float32to16crop(conv.flt));
0132   CPPUNIT_ASSERT(min32MinusUlp32CroppedTo16 == 0.f);
0133 }