Skip to content

Latest commit

 

History

History
185 lines (112 loc) · 4.12 KB

File metadata and controls

185 lines (112 loc) · 4.12 KB

toFloat16

Convert a single-precision floating-point number to the nearest half-precision floating-point number.

Usage

var float32ToFloat16 = require( '@stdlib/number/float32/base/to-float16' );

float32ToFloat16( x )

Converts a single-precision floating-point number to the nearest half-precision floating-point number.

var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );

// Round the double-precision literal to single precision first, as the function operates on single-precision input:
var y = float32ToFloat16( float64ToFloat32( 1.337 ) );
// returns 1.3369140625

Examples

var uniform = require( '@stdlib/random/array/uniform' );
var pickArguments = require( '@stdlib/utils/pick-arguments' );
var logEachMap = require( '@stdlib/console/log-each-map' );
var float32ToFloat16 = require( '@stdlib/number/float32/base/to-float16' );

// Generate an array of 100 random numbers between 0.0 and 100.0, stored as single-precision values:
var f32 = uniform( 100, 0.0, 100.0, {
    'dtype': 'float32'
});

// Convert each single-precision floating-point number to the nearest half-precision floating-point number.
// Only the first callback argument (the array element) is forwarded to the conversion function; any additional callback arguments (such as the element index) are dropped:
logEachMap( 'float32: %f => float16: %f', f32, pickArguments( float32ToFloat16, [ 0 ] ) );

C APIs

Usage

#include "stdlib/number/float32/base/to_float16.h"

stdlib_base_float32_to_float16( x )

Converts a single-precision floating-point number to the nearest half-precision floating-point number.

#include "stdlib/number/float16/ctor.h"

stdlib_float16_t x = stdlib_base_float32_to_float16( 3.14f );

The function accepts the following arguments:

  • x: [in] float input value.

stdlib_float16_t stdlib_base_float32_to_float16( const float x );

Examples

#include "stdlib/number/float32/base/to_float16.h"
#include "stdlib/number/float16/ctor.h"
#include <stdint.h>
#include <stdio.h>

/**
* Converts a few sample single-precision values to half precision and prints, for each one, the input alongside the integer returned by `stdlib_float16_to_bits`.
*/
int main( void ) {
    // Sample inputs: a positive value, a negative value, zero, and `0.0f/0.0f` (intended to produce NaN):
    const float values[] = { 3.14f, -3.14f, 0.0f, 0.0f/0.0f };

    // Derive the number of samples from the array itself rather than repeating its length:
    const int count = (int)( sizeof( values ) / sizeof( values[ 0 ] ) );

    stdlib_float16_t h;
    int k = 0;
    while ( k < count ) {
        h = stdlib_base_float32_to_float16( values[ k ] );
        printf( "%f => uint16: %d\n", values[ k ], stdlib_float16_to_bits( h ) );
        k += 1;
    }
    return 0;
}