manipulating IEEE754 32 bit floats + 64 bit double trick

Last Modified: March 29, 2014, at 08:34 AM
By: robtillaart
Platforms: UNO (others not tested)

The functions presented here are experimental and possibly do not implement all the details of IEEE754, so use with care.

remarks & comments

Intro

This page describes a number of functions and code snippets to manipulate IEEE754 32 bit floats. It contains functions to extract the sign, exponent and mantissa so one can manipulate floats at bit level. Code is also provided to shave CPU cycles off some elementary float functions.

One of the remarks often made about the Arduino is that it does not support IEEE754 64 bit doubles. Two functions on this page allow you to pack and unpack a 32bit float into an 8 byte array which represents an IEEE754 64bit double. This allows the exchange of 64bit doubles to and from the Arduino, e.g. over Serial, with the conversion done at the Arduino side.

In the current implementation the code is presented as plain utility functions. These might become wrapped into Classes in the future.

Data types

The first functionality the lib offers is IEEE754 datatypes. Every float consists of three parts: the sign, the exponent and the mantissa. For a full description please see: 32 bit float -- 64 bit double

struct IEEEfloat;
struct IEEEdouble;
struct _DBL;        // Arduino variant for IEEEdouble 

union _FLOATCONV;   // mapping float to other representations
union _DBLCONV;     // mapping _DBL to other representations

There are two functions for debugging. There is no dump function for a double (yet).

void dumpFloat(float number);
void dumpDBL(struct _DBL dbl);

Conversion functions

There are two functions that convert between a 32bit float and a packed byte array which represents a 64bit double. This can be used e.g. to send a float to a PC where it is received as a 64bit double, or the other way around. As 64 bit has more significant digits the conversion from 32->64 bit will work quite well, but from 64->32 will lose significant digits and the value can be too large to be represented in 32 bits. In this latter case NAN will be returned.

Note that these conversion functions are experimental.

// float -> array of 8 bytes holding an IEEE754 64 bit double
void float2DoublePacked(float number, byte* bar, int byteOrder=LSBFIRST);
// array of 8 bytes holding an IEEE754 64 bit double -> float
float doublePacked2Float(byte* bar, int byteOrder=LSBFIRST);

Arduino sends 64 bit double to Python

Note the communication is binary, not text-mode

Arduino send sketch

//
//    FILE: sendDouble.ino
//  AUTHOR: Rob Tillaart
// VERSION: 0.1.00
// PURPOSE: sends an expanded float as double to PC.
//
// Released to the public domain
//

#include <IEEE754tools.h>

void setup()
{
  // open the serial link at 115200 baud
  const unsigned long baudRate = 115200;
  Serial.begin(baudRate);
}

void loop()
{
  // scale the raw A0 reading by 5.0/1024 and transmit it
  float reading = analogRead(A0) * 5.0 / 1024;
  sendDouble(reading);
  delay(100);

  // follow it with PI as a known reference value
  sendDouble(PI);
  delay(100);
}

void sendDouble(float number)
{
  byte x[8] = {
    0,0,0,0, 0,0,0,0      };

  float2DoublePacked(number, x);
  // simple dump, no handshake or packetizing
  for (int i=0; i<8;i++)
    Serial.write(x[i]);
}
// END OF FILE

Python receive script

import serial
from struct import unpack

ser = serial.Serial("COM31", 115200)

def h():
    """Read 8 bytes from the serial port and decode them as one double.

    The sketch sends the bytes least significant first, so the format is
    pinned to little-endian ('<d') instead of the host's native order.

    Returns the 1-tuple produced by struct.unpack.
    """
    return unpack('<d', ser.read(8))

def main():
    """Re-open the port, then print the pending byte count and each decoded double forever."""
    ser.close()
    ser.open()
    while True:
        # single-argument print(...) behaves the same on Python 2 and 3
        print(ser.inWaiting())
        print(h())

if __name__ == '__main__':
    main()
# END OF FILE

Arduino echoes 64 bit double from Python

Note the communication is binary, not text-mode

Arduino echo sketch

//
//    FILE: echoDouble.ino
//  AUTHOR: Rob Tillaart
// VERSION: 0.1.00
// PURPOSE: receives a double from the PC and echoes it back incremented by 1.
//
// Released to the public domain
//

#include <IEEE754tools.h>

void setup()
{
  // open the serial link at 115200 baud
  const unsigned long baudRate = 115200;
  Serial.begin(baudRate);
}

void loop()
{
  // wait for one value, answer with value + 1, then pause 100 ms
  const float received = receiveDouble();
  sendDouble(received + 1);
  delay(100);
}

float receiveDouble()
{
  byte x[8];
  // wait for 8 bytes
  while (Serial.available() < 8);
  for (int i=0; i<8;i++)
    x[i] = Serial.read();
  return doublePacked2Float(x);
}

void sendDouble(float number)
{
  byte x[8] = {
    0,0,0,0, 0,0,0,0          };

  float2DoublePacked(number, x);
  // simple dump, no handshake or packetizing
  for (int i=0; i<8;i++)
    Serial.write(x[i]);
}
// END OF FILE

Python echo script

import serial
from struct import pack, unpack
from math import pi
import time

ser = serial.Serial("COM31", 115200)

def getDouble():
	"""Read 8 bytes from the serial port and decode them as one double.

	The sketch sends the bytes least significant first, so the format is
	pinned to little-endian ('<d') instead of the host's native order.

	Returns the 1-tuple produced by struct.unpack.
	"""
	return unpack('<d', ser.read(8))

def sendDouble(f):
	"""Pack f as a little-endian IEEE754 double and write its 8 bytes to the port."""
	# Write the whole buffer at once: indexing it yields ints on Python 3,
	# which ser.write() cannot send, while the buffer itself works on 2 and 3.
	ser.write(pack('<d', f))

def main():
	"""Re-open the port, then send pi and print the echoed value forever."""
	ser.close()
	ser.open()
	# give the Arduino some time to 'boot'
	time.sleep(2.0)
	while True:
		sendDouble(pi)
		# single-argument print(...) behaves the same on Python 2 and 3
		print(getDouble())

if __name__ == '__main__':
	main()
# END OF FILE

Notes

To use the library, make a folder in your SKETCHBOOKPATH\libraries with the name IEEE754tools and put the IEEE754tools.h there.

Todo

  • Test test test test ...
  • Fast code to test NAN, -INF and +INF.

Enjoy tinkering,

rob.tillaart@removethisgmail.com

History

  • 2013-09-08 Initial version,

IEEE754tools.h file

//
//    FILE: IEEE754tools.h
//  AUTHOR: Rob Tillaart
// VERSION: 0.1.00
// PURPOSE: IEEE754 tools
//
// http://playground.arduino.cc//Main/IEEE754tools
//
// Released to the public domain
// not tested, use with care
//

#ifndef IEEE754tools_h
#define IEEE754tools_h


#if defined(ARDUINO) && ARDUINO >= 100
#include "Arduino.h"
#else
#include "WProgram.h"
#endif
#include <string.h>   // memcpy

// (un)comment lines to configure functionality / size
//#define IEEE754_ENABLE_MSB
#define IEEE754_ENABLE_DUMP

// IEEE754 32 bit float layout, least significant field first:
//   m = 23 bit mantissa, e = 8 bit biased exponent, s = 1 sign bit.
// All three fields use the same uint32_t base type so they are packed into a
// single 32 bit unit. With mixed base types a compiler may start e in a fresh
// byte, which shifts e and s away from their IEEE754 bit positions.
// Bit-field allocation order remains implementation-defined; this layout
// assumes the first field is placed in the least significant bits.
struct IEEEfloat
{
    uint32_t m:23;
    uint32_t e:8;
    uint32_t s:1;
};

// IEEE754 64 bit double layout, least significant field first:
//   m = 52 bit mantissa, e = 11 bit biased exponent, s = 1 sign bit.
// All three fields use the same uint64_t base type so they are packed into a
// single 64 bit unit. With mixed base types a compiler may start e in a fresh
// storage unit, which shifts e and s away from their IEEE754 bit positions.
// Bit-field allocation order remains implementation-defined; this layout
// assumes the first field is placed in the least significant bits.
struct IEEEdouble
{
    uint64_t m:52;
    uint64_t e:11;
    uint64_t s:1;
};

// Arduino UNO double layout:
// the UNO has no 64 bit double, it is only able to map 23 bits of the mantissa
// a filler is added.
// Fields, in declaration order: filler (29 bits), m (23 bits), e (11 bits)
// and s (1 bit), 64 bits in total.
// NOTE(review): the fields use mixed base types and m crosses a 32 bit
// boundary, so this presumably only lines up with an IEEE754 double when the
// compiler packs bit-fields back to back (as assumed for the UNO) - verify
// before using it on another platform.
struct _DBL
{
    uint32_t filler:29;
    uint32_t m:23;
    uint16_t e:11;
    uint8_t  s:1;
};

// for packing and unpacking a float
// the three members overlay the same storage, so a value written through one
// member can be read back through another representation.
typedef union _FLOATCONV
{
    IEEEfloat p;        // sign / exponent / mantissa bit-fields
    float f;            // the float value itself
    byte b[4];          // the raw bytes
} _FLOATCONV;

// for packing and unpacking a double
// the members overlay the same storage, so the bit-fields written through p
// can be read back as raw bytes through b.
typedef union _DBLCONV
{
    // IEEEdouble p;
    _DBL p;             // sign / exponent / mantissa / filler bit-fields
    double d;           // !! is a 32bit float for UNO.
    byte b[8];          // raw bytes; 8 because the conversion functions copy b[0]..b[7]
} _DBLCONV;


#ifdef IEEE754_ENABLE_DUMP
// print float components: sign, biased exponent and mantissa,
// in HEX, separated by tabs and terminated by a newline.
void dumpFloat(float number)
{
    // Copy the bit pattern instead of casting the pointer: reading a float
    // through a pointer to an unrelated type breaks the strict aliasing rule.
    uint32_t bits;
    memcpy(&bits, &number, sizeof(bits));

    uint8_t  s = bits >> 31;            // bit 31
    uint8_t  e = (bits >> 23) & 0xFF;   // bits 23..30, still biased by 127
    uint32_t m = bits & 0x7FFFFFUL;     // bits 0..22

    Serial.print(s, HEX);
    Serial.print("\t");
    Serial.print(e, HEX);
    Serial.print("\t");
    Serial.println(m, HEX);

    // Serial.print(" sign: "); Serial.print(s);
    // Serial.print("  exp: "); Serial.print(e);
    // Serial.print(" mant: "); Serial.println(m);
}

// print "double" components: sign, exponent and mantissa fields of dbl,
// in HEX, separated by tabs and terminated by a newline.
void dumpDBL(struct _DBL dbl)
{
    const char* separator = "\t";

    Serial.print(dbl.s, HEX);
    Serial.print(separator);

    Serial.print(dbl.e, HEX);
    Serial.print(separator);

    Serial.println(dbl.m, HEX);
}
#endif


//
// converts a float to a packed array of 8 bytes representing a 64 bit double
// restriction exponent and mantisse.
//
// float;  array of 8 bytes;  LSBFIRST; MSBFIRST
//
void float2DoublePacked(float number, byte* bar, int byteOrder=LSBFIRST)  
{
    _FLOATCONV fl;
    fl.f = number;
    _DBLCONV dbl;
    dbl.p.s = fl.p.s;
    dbl.p.e = fl.p.e-127 +1023;  // exponent adjust
    dbl.p.m = fl.p.m;

#ifdef IEEE754_ENABLE_MSB
    if (byteOrder == LSBFIRST)
    {
#endif
        for (int i=0; i<8; i++)
        {
            bar[i] = dbl.b[i];
        }
#ifdef IEEE754_ENABLE_MSB
    }
    else
    {
        for (int i=0; i<8; i++)
        {
            bar[i] = dbl.b[7-i];
        }
    }
#endif
}

//
// converts a packed array of 8 bytes (a 64 bit double) into a 32bit float.
//
// bar       : the 8 bytes of the double
// byteOrder : LSBFIRST (default) or MSBFIRST; MSBFIRST is only honoured
//             when IEEE754_ENABLE_MSB is defined
//
// the mantissa is truncated to 23 bits.
// +0 / -0 and +INF / -INF keep their value, NAN stays NAN.
// a finite value too large for a float returns NAN (exponent overflow).
// a value too small for a normal float becomes a subnormal float or +0 / -0.
//
float doublePacked2Float(byte* bar, int byteOrder=LSBFIRST)
{
    _FLOATCONV fl;
    _DBLCONV dbl;

    // copy through a byte pointer so all 8 bytes of the union are reachable
    byte* raw = (byte*) &dbl;

#ifdef IEEE754_ENABLE_MSB
    const bool reversed = (byteOrder != LSBFIRST);
#else
    (void) byteOrder;           // only LSBFIRST is supported in this configuration
    const bool reversed = false;
#endif

    for (uint8_t i = 0; i < 8; i++)
    {
        raw[i] = bar[reversed ? 7 - i : i];
    }

    uint16_t de = dbl.p.e;      // biased exponent of the double
    uint32_t m = dbl.p.m;       // upper 23 mantissa bits; the rest is clipped

    fl.p.s = dbl.p.s;

    if (de == 2047)
    {
        // INF / NAN: a NAN whose payload sits only in the clipped bits must
        // not turn into INF, so force a mantissa bit in that case.
        if (m == 0 && dbl.p.filler != 0) m = 0x400000UL;
        fl.p.e = 255;
        fl.p.m = m;
        return fl.f;
    }

    int e = (int) de - 1023 + 127;  // exponent adjust

    if (e >= 255)
    {
        return NAN;             // exponent overflow: not representable as a float
    }

    if (e > 0)
    {
        fl.p.e = e;             // normal float
        fl.p.m = m;
        return fl.f;
    }

    // e <= 0: zero, a subnormal double, or a value below the normal float range
    fl.p.e = 0;
    if (de == 0 || e < -22)
    {
        fl.p.m = 0;             // (signed) zero, or too small even for a subnormal
    }
    else
    {
        // subnormal float: make the implicit leading 1 explicit and shift it
        // into place; 1 - e is between 1 and 23 here.
        fl.p.m = (0x800000UL | m) >> (1 - e);
    }
    return fl.f;
}

//
// NOT TESTED FUNCTIONS
// 
// Returns the sign bit of number: 1 for negative values (including -0.0), 0 otherwise.
uint8_t getSign(float number)
{
  // Copy the bit pattern instead of casting the pointer: reading a float
  // through a pointer to an unrelated type breaks the strict aliasing rule.
  uint32_t bits;
  memcpy(&bits, &number, sizeof(bits));
  return (uint8_t) (bits >> 31);
}

// Returns the exponent of number with the IEEE754 bias of 127 removed,
// e.g. 3 for 8.0 and -1 for 0.5. Zero yields -127, INF / NAN yield 128.
int getExponent(float number)
{
  // Copy the bit pattern instead of casting the pointer: reading a float
  // through a pointer to an unrelated type breaks the strict aliasing rule.
  uint32_t bits;
  memcpy(&bits, &number, sizeof(bits));
  return (int) ((bits >> 23) & 0xFF) - 127;
}

// Returns the 23 stored mantissa bits of number (without the implicit leading 1).
uint32_t getMantisse(float number)
{
  // Copy the bit pattern instead of casting the pointer: reading a float
  // through a pointer to an unrelated type breaks the strict aliasing rule.
  uint32_t bits;
  memcpy(&bits, &number, sizeof(bits));
  return bits & 0x7FFFFFUL;
}

/*
// ONELINERS to speed up some specific 32 bit float math

// *(((byte*) &number)+3) &= 0x7F;              // number = fabs(number);
// x = *(((byte*) &number)+3) & 0x7F;           // x = fabs(number);
// GAIN = factor 2

// *(((byte*) &number)+3) |= 0x80;              // number = -fabs(number);
// x = *(((byte*) &number)+3) | 0x80;           // x = -fabs(number);
// GAIN = factor 2

// *(((byte*) &number)+3) ^= 0x80;              // number = -number;
// x = *(((byte*) &number)+3) ^ 0x80;           // x = -number;
// GAIN = factor 2

// s = *(((uint8_t*) &number)+3) & 0x80;        // s = sign(number);
// if ( *(((byte*) &number)+3) & 0x80) x=2;     // if (number < 0) x=2;
// GAIN = factor 5

int getExponent(float number)
{
    uint8_t e = (*(((uint8_t*) &number)+3) & 0x7F) << 1;
    if (*(((uint8_t*) &number)+2) & 0x80) e++;
    return e - 127;   // remove the exponent bias of 127
}

*/
#endif
// END OF FILE

Share