testMiniFloat

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
#include <cppunit/extensions/HelperMacros.h>
#include <iostream>

#include "DataFormats/Math/interface/libminifloat.h"
#include "FWCore/Utilities/interface/isFinite.h"

class testMiniFloat : public CppUnit::TestFixture {
  CPPUNIT_TEST_SUITE(testMiniFloat);

  CPPUNIT_TEST(testIsDenorm);
  CPPUNIT_TEST(testMax);
  CPPUNIT_TEST(testMax32RoundedToMax16);
  CPPUNIT_TEST(testMin);
  CPPUNIT_TEST(testMin32RoundedToMin16);
  CPPUNIT_TEST(testDenormMin);

  CPPUNIT_TEST_SUITE_END();

public:
  void setUp() {}
  void tearDown() {}

  void testIsDenorm();
  void testMax();
  void testMax32RoundedToMax16();
  void testMin();
  void testMin32RoundedToMin16();
  void testDenormMin();

private:
};

CPPUNIT_TEST_SUITE_REGISTRATION(testMiniFloat);

void testMiniFloat::testIsDenorm() {
  // all float16s with zero exponent and non-zero mantissa are denormals, test here the boundaries
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(1));
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(1 | (1 << 15)));  // negative 1
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(0x3ff));
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(0x3ff) | (1 << 15));  // negative full-1 mantissa

  // Test also boundary cases for float16 not being denormal
  CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0));
  CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0x400));              // exponent 1, zero mantissa
  CPPUNIT_ASSERT(!MiniFloatConverter::isdenorm(0x400 | (1 << 15)));  // negative exponent 1, zero mantissa
}

void testMiniFloat::testMax() {
  // 0x1f exponent is for inf, so 0x1e is the maximum
  // in maximum mantissa all bits are 1
  const uint16_t minifloatmax = (0x1e << 10) | 0x3ff;
  CPPUNIT_ASSERT(MiniFloatConverter::max() == MiniFloatConverter::float16to32(minifloatmax));

  // adding 1ulp(16) to max should give inf
  const uint16_t minifloatinf = minifloatmax + 1;
  CPPUNIT_ASSERT(edm::isNotFinite(MiniFloatConverter::float16to32(minifloatinf)));
}

void testMiniFloat::testMax32RoundedToMax16() {
  // max32RoundedToMax16() -> float16 -> float32 should give max()
  CPPUNIT_ASSERT(MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(
                     MiniFloatConverter::max32RoundedToMax16())) == MiniFloatConverter::max());

  // max32RoundedToMax16() + 1ulp(32) should give inf(16)
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = MiniFloatConverter::max32RoundedToMax16();
  conv.i32 += 1;
  const float max32PlusUlp32RoundedTo16 = MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(conv.flt));
  CPPUNIT_ASSERT(edm::isNotFinite(max32PlusUlp32RoundedTo16));
}

void testMiniFloat::testMin() {
  // 1 exponent, and 0 mantissa gives the smallest non-denormalized number of float16
  CPPUNIT_ASSERT(MiniFloatConverter::min() == MiniFloatConverter::float16to32(1 << 10));

  // subtracting 1ulp(16) from min should give denormalized float16
  const uint16_t minifloat_denorm = MiniFloatConverter::float32to16(MiniFloatConverter::min()) - 1;
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(minifloat_denorm));

  // subtracking 1ulp(32) from min should also give denormalized float16 (both crop and round)
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = MiniFloatConverter::min();
  conv.i32 -= 1;
  const uint16_t min32MinusUlp32CroppedTo16 = MiniFloatConverter::float32to16crop(conv.flt);
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32CroppedTo16));
  const uint16_t min32MinusUlp32RoundedTo16 = MiniFloatConverter::float32to16round(conv.flt);
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32RoundedTo16));
}

void testMiniFloat::testMin32RoundedToMin16() {
  // min32RoundedToMin16() -> float16 -> float32 should be the same as min()
  CPPUNIT_ASSERT(MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(
                     MiniFloatConverter::min32RoundedToMin16())) == MiniFloatConverter::min());

  // min32RoundedToMax16() - 1ulp(32) should give denormalized float16
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = MiniFloatConverter::min32RoundedToMin16();
  conv.i32 -= 1;
  const uint16_t min32MinusUlp32RoundedTo16 = MiniFloatConverter::float32to16(conv.flt);
  CPPUNIT_ASSERT(MiniFloatConverter::isdenorm(min32MinusUlp32RoundedTo16));
}

void testMiniFloat::testDenormMin() {
  // zero exponent, and 0x1 in mantissa gives the smallest number of float16
  CPPUNIT_ASSERT(MiniFloatConverter::denorm_min() == MiniFloatConverter::float16to32(1));

  // subtracting 1ulp(16) from denorm_min should give 0 float32
  CPPUNIT_ASSERT(
      MiniFloatConverter::float16to32(MiniFloatConverter::float32to16(MiniFloatConverter::denorm_min()) - 1) == 0.f);

  // subtracking 1ulp(32) from denorm_min should also give 0 float32
  union {
    float flt;
    uint32_t i32;
  } conv;
  conv.flt = MiniFloatConverter::denorm_min();
  conv.i32 -= 1;
  const float min32MinusUlp32RoundedTo16 =
      MiniFloatConverter::float16to32(MiniFloatConverter::float32to16round(conv.flt));
  CPPUNIT_ASSERT(min32MinusUlp32RoundedTo16 == 0.f);
  const float min32MinusUlp32CroppedTo16 =
      MiniFloatConverter::float16to32(MiniFloatConverter::float32to16crop(conv.flt));
  CPPUNIT_ASSERT(min32MinusUlp32CroppedTo16 == 0.f);
}