#include <iostream> // cout
#include <cstdint> // uint32_t, uint64_t
#include <cstring> // memcpy
#include <cmath> // nextafter
using namespace std;
// Simulated fp16 ("Huawei FP16"): a double whose value is restricted to a
// half-precision-like mantissa and exponent range by editing the bits of its
// IEEE-754 binary64 representation directly.
//
// Conversion performed by the constructor:
//  - the mantissa is truncated (NOT rounded) to its 10 most significant bits;
//  - |x| >= 2^17 (biased exponent >= 1040) becomes +/-Inf; a NaN input has
//    an all-ones exponent, takes the same path and therefore also becomes Inf;
//  - |x| <  2^-15 (biased exponent < 1008) becomes +0.0, the sign is dropped.
//
// NOTE: the largest finite result is 131008 (= (2 - 2^-10) * 2^16), which is
// wider than IEEE-754 binary16 (65504). That range is intentionally left
// as is here; CorrectFloat16 is the variant with the narrower overflow check.
class Float16 {
    // Number of low binary64 mantissa bits that get cleared (52 - 10 kept bits).
    static constexpr uint32_t mantissaShift = 42;
    // Position of the six middle exponent bits (bits 56..61).
    static constexpr uint32_t expShiftMid = 56;
    // Position of the four lowest exponent bits (bits 52..55).
    static constexpr uint32_t expShiftOut = 52;
    double dValue_;

public:
    // Implicit on purpose: existing callers may rely on double -> Float16 conversion.
    Float16(double in) : dValue_(in) {
        static_assert(sizeof(double) == sizeof(uint64_t),
                      "bit manipulation below assumes a 64-bit double");
        uint64_t utmp;
        memcpy(&utmp, &dValue_, sizeof utmp);
        // Zero the mantissa bits starting from the 11th (this is NOT rounding).
        utmp = utmp >> mantissaShift;
        utmp = utmp << mantissaShift;
        // Masks used to judge the 11-bit exponent against a 5-bit exponent range.
        const uint64_t maskExpMid = (63llu << expShiftMid);
        const uint64_t maskExpOut = (15llu << expShiftOut);
        const uint64_t maskExpLead = (1llu << 62);
        // Sign bit plus the whole exponent field, i.e. everything except the mantissa.
        const uint64_t maskSignExp = (1llu << 63) + maskExpLead + maskExpMid + maskExpOut;
        if (utmp & maskExpLead) { // leading exponent bit set: |x| >= 2, overflow possible
            if (utmp & maskExpMid) { // any middle bit set: exponent is too large
                // Build Inf with the original sign: all exponent bits 1, mantissa 0.
                utmp = utmp | maskExpMid;
                utmp = utmp & maskSignExp; // clear the mantissa so the result is not NaN
                utmp = utmp | maskExpOut;
            }
        } else { // leading exponent bit clear: |x| < 2, underflow possible
            if ((utmp & maskExpMid) != maskExpMid) { // any middle bit clear: too small
                utmp = 0; // flush to +0.0
            }
        }
        memcpy(&dValue_, &utmp, sizeof utmp);
    }

    // Returns the stored (already truncated/clamped) value as a double.
    explicit operator double() const { return dValue_; }
};
// Simulated fp16 with the IEEE-754 binary16 overflow threshold: a double whose
// value is restricted to a half-precision-like mantissa and exponent range by
// editing the bits of its IEEE-754 binary64 representation directly.
//
// Conversion performed by the constructor:
//  - the mantissa is truncated (NOT rounded) to its 10 most significant bits;
//  - |x| >= 2^16 (biased exponent >= 1039) becomes +/-Inf, so the largest
//    finite result is 65504; a NaN input has an all-ones exponent, takes the
//    same path and therefore also becomes Inf;
//  - |x| <  2^-15 (biased exponent < 1008) becomes +0.0, the sign is dropped.
class CorrectFloat16 {
    // Number of low binary64 mantissa bits that get cleared (52 - 10 kept bits).
    static constexpr uint32_t mantissaShift = 42;
    // Position of the six middle exponent bits (bits 56..61).
    static constexpr uint32_t expShiftMid = 56;
    // Position of the four lowest exponent bits (bits 52..55).
    static constexpr uint32_t expShiftOut = 52;
    double dValue_;

public:
    // Implicit on purpose: existing callers may rely on double -> CorrectFloat16 conversion.
    CorrectFloat16(double in) : dValue_(in) {
        static_assert(sizeof(double) == sizeof(uint64_t),
                      "bit manipulation below assumes a 64-bit double");
        uint64_t utmp;
        memcpy(&utmp, &dValue_, sizeof utmp);
        // Zero the mantissa bits starting from the 11th (this is NOT rounding).
        utmp = utmp >> mantissaShift;
        utmp = utmp << mantissaShift;
        // Masks used to judge the 11-bit exponent against a 5-bit exponent range.
        const uint64_t maskExpMid = (63llu << expShiftMid);
        const uint64_t maskExpOut = (15llu << expShiftOut);
        const uint64_t maskExpLead = (1llu << 62);
        // Sign bit plus the whole exponent field, i.e. everything except the mantissa.
        const uint64_t maskSignExp = (1llu << 63) + maskExpLead + maskExpMid + maskExpOut;
        if (utmp & maskExpLead) { // leading exponent bit set: |x| >= 2, overflow possible
            // Overflow when any middle bit is set (exponent >= 17) or when the four
            // low bits are all ones (with no middle bit set that is exponent 16).
            if (utmp & maskExpMid || (utmp & maskExpOut) == maskExpOut) {
                // Build Inf with the original sign: all exponent bits 1, mantissa 0.
                utmp = utmp | maskExpMid;
                utmp = utmp & maskSignExp; // clear the mantissa so the result is not NaN
                utmp = utmp | maskExpOut;
            }
        } else { // leading exponent bit clear: |x| < 2, underflow possible
            if ((utmp & maskExpMid) != maskExpMid) { // any middle bit clear: too small
                utmp = 0; // flush to +0.0
            }
        }
        memcpy(&dValue_, &utmp, sizeof utmp);
    }

    // Returns the stored (already truncated/clamped) value as a double.
    explicit operator double() const { return dValue_; }
};
// Pick the half-precision type that main() uses as the "IEEE-754 FP16" reference.
// With GCC 13+ and a post-C++20 language mode, <stdfloat> is expected to supply
// std::float16_t, which is reachable unqualified through the file-level
// `using namespace std;`.
#if __GNUC__ >= 13 && __cplusplus >= 202100L
#include <stdfloat> // float16_t
#else
// Otherwise alias the compiler-provided _Float16 type under the same name.
typedef _Float16 float16_t; // GCC >= 12 || Clang >= 15
#endif
int main() {
    cout.precision(17);

    // Prints one report section for a half-precision flavour.
    //   title     - heading line (including its trailing newline)
    //   maxFinite - largest double that still converts to a finite fp16 value
    //   toFp16    - callable converting double -> fp16 flavour -> double
    const auto report = [](const char* title, double maxFinite, auto toFp16) {
        // Next representable double above maxFinite: the first value that overflows.
        const double firstOverflow = nextafter(maxFinite, INFINITY);
        const double maxAsFp16 = toFp16(maxFinite);

        cout << title;
        cout << " MAX\n";
        cout << " fp64: " << defaultfloat << maxFinite << " (" << hexfloat << maxFinite << ")\n"
             << " fp16: " << defaultfloat << maxAsFp16 << " (" << hexfloat << maxAsFp16 << ")\n";
        cout << " OVERFLOW\n";
        cout << " fp64: " << defaultfloat << firstOverflow << " (" << hexfloat << firstOverflow << ")\n"
             << " fp16: " << defaultfloat << toFp16(firstOverflow) << "\n";
        cout << " RANGE\n";
        cout << " fp64: (" << -firstOverflow << "; " << firstOverflow << ")\n"
             << " fp16: [" << toFp16(-maxFinite) << "; " << maxAsFp16 << "]\n";
    };

    // 131071.99999999999: largest double below 2^17.
    report("Huawei FP16\n", 0x1p17 - 0x1p-36,
           [](double v) { return double(Float16(v)); });
    cout << "\n";

    // 65519.99999999999: largest double below 65520.
    report("IEEE-754 FP16\n", 0x1p16 - 0x1p-37 - 0x1p4,
           [](double v) { return double(float16_t(v)); });
    cout << "\n";

    // 65535.99999999999: largest double below 2^16.
    report("Huawei FP16 with correct range\n", 0x1p16 - 0x1p-37,
           [](double v) { return double(CorrectFloat16(v)); });
}