Floating Point Constant Values In C++, CUDA, and Python

Introduction

Floating-point constants, such as infinity, maximum, minimum, and zero, are essential for many numerical algorithms and scientific computations.

In this blog post, I would like to discuss how to use floating-point constant values in C++, CUDA, and Python.

Floating Point Constant Values In C++, CUDA, and Python

The following examples show how to obtain floating-point constants in C++, CUDA, and Python, and use assertions to verify their expected properties.

C++ Example

floating_point.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#include <limits>
#include <stdfloat>

#include <cassert>
#include <cmath>

// Compile-time query: true only when std::numeric_limits<T> reports both an
// infinity and a quiet NaN representation for T.
template <typename T>
constexpr bool stdfloat_has_special_values()
{
    using limits = std::numeric_limits<T>;
    constexpr bool has_inf = limits::has_infinity;
    constexpr bool has_qnan = limits::has_quiet_NaN;
    return has_inf && has_qnan;
}

// Generic constexpr equality implemented as a bit-set comparison: two values
// are equal when neither has a bit set that the other lacks.
// T must support binary `&` and unary `~`, and the result of `&` must be
// comparable with 0.
// NOTE(review): floating-point types provide neither `&` nor `~`, and the
// language only considers user-declared operators when an operand has class
// or enumeration type, so this overload presumably never participates in the
// floating-point comparisons made in main() - confirm whether it is needed.
template <typename T>
constexpr bool operator==(T const& lhs, T const& rhs)
{
    // Any bit present in lhs but missing from rhs means the values differ.
    if (!((lhs & ~rhs) == 0))
    {
        return false;
    }
    // Otherwise they are equal exactly when rhs has no extra bits either.
    return (rhs & ~lhs) == 0;
}

// Generic constexpr inequality: the logical negation of whatever `==`
// resolves to for operands of type T.
template <typename T>
constexpr bool operator!=(T const& lhs, T const& rhs)
{
    const bool same = (lhs == rhs);
    return !same;
}

// Verifies that each C++23 extended floating-point type from <stdfloat> is
// available, reports infinity and quiet NaN support, has an infinity that
// lies strictly outside its finite range, and that exp(-infinity) is zero.
// All checks except the std::exp ones are evaluated at compile time.
int main()
{
    // Availability is tested by the preprocessor. Naming an undefined
    // __STDCPP_*_T__ macro inside static_assert yields an "undeclared
    // identifier" error, so the intended message would never be shown.
#ifndef __STDCPP_FLOAT16_T__
#error "No float16 support"
#endif
#ifndef __STDCPP_FLOAT32_T__
#error "No float32 support"
#endif
#ifndef __STDCPP_FLOAT64_T__
#error "No float64 support"
#endif
#ifndef __STDCPP_FLOAT128_T__
#error "No float128 support"
#endif
#ifndef __STDCPP_BFLOAT16_T__
#error "No bfloat16 support"
#endif

    using limits_f16 = std::numeric_limits<std::float16_t>;
    using limits_f32 = std::numeric_limits<std::float32_t>;
    using limits_f64 = std::numeric_limits<std::float64_t>;
    using limits_f128 = std::numeric_limits<std::float128_t>;
    using limits_bf16 = std::numeric_limits<std::bfloat16_t>;

    // Every type must advertise both an infinity and a quiet NaN.
    static_assert(stdfloat_has_special_values<std::float16_t>(),
                  "No special values for float16");
    static_assert(stdfloat_has_special_values<std::float32_t>(),
                  "No special values for float32");
    static_assert(stdfloat_has_special_values<std::float64_t>(),
                  "No special values for float64");
    static_assert(stdfloat_has_special_values<std::float128_t>(),
                  "No special values for float128");
    static_assert(stdfloat_has_special_values<std::bfloat16_t>(),
                  "No special values for bfloat16");

    // Positive infinity compares greater than the largest finite value.
    static_assert(limits_f16::infinity() > limits_f16::max(),
                  "No infinity for float16");
    static_assert(limits_f32::infinity() > limits_f32::max(),
                  "No infinity for float32");
    static_assert(limits_f64::infinity() > limits_f64::max(),
                  "No infinity for float64");
    static_assert(limits_f128::infinity() > limits_f128::max(),
                  "No infinity for float128");
    static_assert(limits_bf16::infinity() > limits_bf16::max(),
                  "No infinity for bfloat16");

    // Negative infinity compares less than the most negative finite value
    // (lowest(), not min(), which is the smallest positive normal value).
    static_assert(-limits_f16::infinity() < limits_f16::lowest(),
                  "No negative infinity for float16");
    static_assert(-limits_f32::infinity() < limits_f32::lowest(),
                  "No negative infinity for float32");
    static_assert(-limits_f64::infinity() < limits_f64::lowest(),
                  "No negative infinity for float64");
    static_assert(-limits_f128::infinity() < limits_f128::lowest(),
                  "No negative infinity for float128");
    static_assert(-limits_bf16::infinity() < limits_bf16::lowest(),
                  "No negative infinity for bfloat16");

    // std::exp is not a constexpr function in C++23, so these checks run at
    // run time (and are compiled out when NDEBUG is defined).
    assert(std::exp(-limits_f16::infinity()) == 0.0f16);
    assert(std::exp(-limits_f32::infinity()) == 0.0f32);
    assert(std::exp(-limits_f64::infinity()) == 0.0f64);
    assert(std::exp(-limits_f128::infinity()) == 0.0f128);
    assert(std::exp(-limits_bf16::infinity()) == 0.0bf16);
}
1
2
$ g++ floating_point.cpp -o floating_point_cpp -std=c++23
$ ./floating_point_cpp

CUDA Example

floating_point.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#include <cassert>

#include <cuda_bf16.h>
#include <cuda_fp16.h>

// Raw 16-bit patterns of the half-precision (FP16) constants documented at:
// https://docs.nvidia.com/cuda/archive/12.8.0/cuda-math-api/cuda_math_api/group__CUDA__MATH__INTRINSIC__HALF__CONSTANTS.html
// FP16 layout: 1 sign bit, 5 exponent bits, 10 mantissa bits.
// Listed in ascending order of the value they encode.
constexpr unsigned short CUDART_ZERO_FP16_BITS{0x0000U};        // +0
constexpr unsigned short CUDART_MIN_DENORM_FP16_BITS{0x0001U};  // smallest positive subnormal
constexpr unsigned short CUDART_MAX_NORMAL_FP16_BITS{0x7BFFU};  // largest finite value
constexpr unsigned short CUDART_INF_FP16_BITS{0x7C00U};         // +infinity

// Raw 16-bit patterns of the bfloat16 (BF16) constants documented at:
// https://docs.nvidia.com/cuda/archive/12.8.0/cuda-math-api/cuda_math_api/group__CUDA__MATH__INTRINSIC__BFLOAT16__CONSTANTS.html
// BF16 layout: 1 sign bit, 8 exponent bits, 7 mantissa bits.
// Listed in ascending order of the value they encode.
constexpr unsigned short CUDART_ZERO_BF16_BITS{0x0000U};        // +0
constexpr unsigned short CUDART_MIN_DENORM_BF16_BITS{0x0001U};  // smallest positive subnormal
constexpr unsigned short CUDART_MAX_NORMAL_BF16_BITS{0x7F7FU};  // largest finite value
constexpr unsigned short CUDART_INF_BF16_BITS{0x7F80U};         // +infinity

int main()
{
assert(CUDART_INF_FP16 ==
reinterpret_cast<__half const&>(CUDART_INF_FP16_BITS));
assert(CUDART_MAX_NORMAL_FP16 ==
reinterpret_cast<__half const&>(CUDART_MAX_NORMAL_FP16_BITS));
assert(CUDART_MIN_DENORM_FP16 ==
reinterpret_cast<__half const&>(CUDART_MIN_DENORM_FP16_BITS));
assert(CUDART_ZERO_FP16 ==
reinterpret_cast<__half const&>(CUDART_ZERO_FP16_BITS));
assert(CUDART_INF_FP16 > CUDART_MAX_NORMAL_FP16);
assert(-CUDART_INF_FP16 < -CUDART_MAX_NORMAL_FP16);

assert(CUDART_INF_BF16 ==
reinterpret_cast<__nv_bfloat16 const&>(CUDART_INF_BF16_BITS));
assert(CUDART_MAX_NORMAL_BF16 ==
reinterpret_cast<__nv_bfloat16 const&>(CUDART_MAX_NORMAL_BF16_BITS));
assert(CUDART_MIN_DENORM_BF16 ==
reinterpret_cast<__nv_bfloat16 const&>(CUDART_MIN_DENORM_BF16_BITS));
assert(CUDART_ZERO_BF16 ==
reinterpret_cast<__nv_bfloat16 const&>(CUDART_ZERO_BF16_BITS));
assert(CUDART_INF_BF16 > CUDART_MAX_NORMAL_BF16);
assert(-CUDART_INF_BF16 < -CUDART_MAX_NORMAL_BF16);
}
1
2
$ nvcc floating_point.cu -o floating_point_cuda
$ ./floating_point_cuda

Python Example

floating_point.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import numpy as np


def check_float_constants(dtype):
    """Assert the expected properties of infinity for a NumPy float type.

    Args:
        dtype: A NumPy floating-point scalar type such as ``np.float16``,
            ``np.float32`` or ``np.float64``.

    Raises:
        AssertionError: If any of the checks below does not hold.
    """
    inf = dtype("inf")
    limits = np.finfo(dtype)

    # Positive infinity is greater than the largest finite value.
    assert inf > limits.max
    # np.finfo(...).min is the most negative finite value, not the smallest
    # positive one, so negative infinity must lie below it.
    assert -inf < limits.min
    # exp(-inf) evaluates to exactly zero in the same precision.
    assert np.exp(-inf) == dtype(0)


if __name__ == "__main__":

    for dtype in (np.float16, np.float32, np.float64):
        check_float_constants(dtype)
1
$ python floating_point.py

References

Floating Point Constant Values In C++, CUDA, and Python

https://leimao.github.io/blog/Floating-Point-Constant-Values-CPP-CUDA-Python/

Author

Lei Mao

Posted on

08-22-2025

Updated on

08-22-2025

Licensed under


Comments