Function Binding and Performance Measurement

Introduction

Recently, I realized that the code I had been using to measure function performance in my previous articles was neither clean nor elegant.

In this blog post, I would like to quickly show how to implement reusable performance measurement code in C++, CUDA, and Python.

Performance Measurement

In the following examples, I created three equivalent implementations in C++, CUDA, and Python, respectively.

C++

The key is to use std::function and std::bind to bind functions and input parameters before performance measurement.

bind.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

// Measures the mean wall-clock latency, in milliseconds, of a zero-argument
// callable. Runs `num_warmups` untimed calls first so the timed loop sees
// steady-state behavior, then times `num_repeats` calls as one interval.
//
// Returns the average latency per call in milliseconds.
template <class T>
float measure_performance(std::function<T(void)> bound_function,
                          size_t num_repeats = 100, size_t num_warmups = 100)
{
    // Warm-up calls are excluded from the measurement.
    for (size_t i{0}; i < num_warmups; ++i)
    {
        bound_function();
    }

    std::chrono::steady_clock::time_point const time_start{
        std::chrono::steady_clock::now()};
    for (size_t i{0}; i < num_repeats; ++i)
    {
        bound_function();
    }
    std::chrono::steady_clock::time_point const time_end{
        std::chrono::steady_clock::now()};

    // Use a floating-point millisecond duration instead of
    // duration_cast<std::chrono::milliseconds>: the integral cast truncates
    // to whole milliseconds BEFORE the division, so any sub-millisecond
    // resolution would be lost.
    std::chrono::duration<float, std::milli> const time_elapsed{time_end -
                                                                time_start};
    float const latency{time_elapsed.count() /
                        static_cast<float>(num_repeats)};

    return latency;
}

// Toy workload for the benchmark: sleeps for roughly 10 ms, then returns the
// sum of its two arguments.
double test_function(double a, double const& b)
{
    constexpr std::chrono::milliseconds simulated_work{10};
    std::this_thread::sleep_for(simulated_work);
    return a + b;
}

int main()
{
constexpr size_t num_repeats{10};
constexpr size_t num_warmups{10};

std::vector<std::function<double(void)>> test_functions;
std::vector<std::tuple<double, double>> test_function_inputs{
{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}};

for (auto const& inputs : test_function_inputs)
{
std::function<double(void)> function{
std::bind(test_function, std::get<0>(inputs), std::get<1>(inputs))};
test_functions.push_back(function);
}

for (auto const& function : test_functions)
{
float latency{measure_performance(function, num_repeats, num_warmups)};
std::cout << std::fixed << std::setprecision(3)
<< "Latency: " << latency << " ms" << std::endl;
}
}
1
2
3
4
5
$ g++ bind.cpp -o bind --std c++14
$ ./bind
Latency: 10.100 ms
Latency: 10.100 ms
Latency: 10.100 ms

Python

Similar to the C++ version, the key is to use functools.partial to bind functions and input parameters before performance measurement.

bind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import time
import functools
from timeit import default_timer as timer
from datetime import timedelta


def measure_performance(bound_function: functools.partial,
                        num_repeats: int = 100,
                        num_warmups: int = 100) -> float:
    """Measure the mean latency of a zero-argument callable in milliseconds.

    Runs ``num_warmups`` untimed calls first so the timed loop sees
    steady-state behavior, then times ``num_repeats`` calls as one interval.

    Args:
        bound_function: Zero-argument callable (e.g. a ``functools.partial``
            with all arguments already bound).
        num_repeats: Number of timed calls averaged into the result.
        num_warmups: Number of untimed warm-up calls.

    Returns:
        Average latency per call, in milliseconds.
    """
    for _ in range(num_warmups):
        bound_function()

    time_start = timer()
    for _ in range(num_repeats):
        bound_function()
    time_end = timer()

    # timer() already returns float seconds, so convert to milliseconds
    # directly. The previous timedelta(seconds=...) round-trip was needless
    # and rounded the interval to whole microseconds.
    time_elapsed = (time_end - time_start) * 1000

    latency = time_elapsed / num_repeats

    return latency


def test_function(a: float, b: float) -> float:

time.sleep(10 / 1000)

return a + b


if __name__ == "__main__":

    num_repeats = 10
    num_warmups = 10

    # Bind each argument pair to the test function up front so that the
    # measurement loop only deals with zero-argument callables.
    argument_pairs = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)]
    bound_functions = [
        functools.partial(test_function, a, b) for a, b in argument_pairs
    ]

    for bound in bound_functions:
        latency = measure_performance(bound_function=bound,
                                      num_repeats=num_repeats,
                                      num_warmups=num_warmups)
        print(f"Latency: {latency:.3f} ms")
1
2
3
4
$ python bind.py
Latency: 10.081 ms
Latency: 10.070 ms
Latency: 10.090 ms

CUDA

Similar to the C++ example, we also use std::function and std::bind to bind functions and input parameters before performance measurement. However, to measure the latency using CUDA events, we will need to know which CUDA stream the kernel is launched on and synchronize accordingly.

bind.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include <cuda_runtime.h>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
// Aborts with a diagnostic (file, line, error string, and the stringified
// call expression) if a CUDA runtime call returned an error code.
void check(cudaError_t err, const char* const func, const char* const file,
           const int line)
{
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    std::exit(EXIT_FAILURE);
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
// Aborts with a diagnostic if the most recent CUDA runtime call or kernel
// launch left an error behind (queried via cudaGetLastError).
void checkLast(const char* const file, const int line)
{
    cudaError_t const err{cudaGetLastError()};
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << std::endl;
    std::exit(EXIT_FAILURE);
}

// Measures the mean device-side latency, in milliseconds, of a bound
// callable that enqueues work on `stream`. The timed region is bracketed by
// CUDA events recorded on that same stream, so only GPU work submitted to
// the stream is measured.
template <class T>
float measure_performance(std::function<T(cudaStream_t)> bound_function,
cudaStream_t stream, size_t num_repeats = 100,
size_t num_warmups = 100)
{
cudaEvent_t start, stop;
float time;

CHECK_CUDA_ERROR(cudaEventCreate(&start));
CHECK_CUDA_ERROR(cudaEventCreate(&stop));

// Warm-up launches: excluded from the measurement.
for (size_t i{0}; i < num_warmups; ++i)
{
bound_function(stream);
}

// Drain the stream so warm-up work cannot bleed into the timed region.
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));

CHECK_CUDA_ERROR(cudaEventRecord(start, stream));
for (size_t i{0}; i < num_repeats; ++i)
{
bound_function(stream);
}
CHECK_CUDA_ERROR(cudaEventRecord(stop, stream));
// Block until the stop event (and therefore every timed launch preceding
// it on the stream) has completed.
CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
CHECK_LAST_CUDA_ERROR();
// Elapsed time between the two events, in milliseconds.
CHECK_CUDA_ERROR(cudaEventElapsedTime(&time, start, stop));
CHECK_CUDA_ERROR(cudaEventDestroy(start));
CHECK_CUDA_ERROR(cudaEventDestroy(stop));

float const latency{time / num_repeats};

return latency;
}

// Elementwise sum: output[i] = input_1[i] + input_2[i] for i in [0, n).
// The grid-stride loop (step = blockDim.x * gridDim.x) lets any launch
// configuration cover all n elements.
__global__ void float_addition(float* output, float const* input_1,
                               float const* input_2, size_t n)
{
    size_t const thread_id{blockDim.x * blockIdx.x + threadIdx.x};
    size_t const total_threads{blockDim.x * gridDim.x};
    for (size_t i{thread_id}; i < n; i += total_threads)
    {
        output[i] = input_1[i] + input_2[i];
    }
}

// Host-side launcher: enqueues the float_addition kernel on `stream` and
// checks for launch errors.
void launch_float_addition(float* output, float const* input_1,
                           float const* input_2, size_t n,
                           cudaStream_t stream)
{
    dim3 const block_dim{1024};
    dim3 const grid_dim{32};
    float_addition<<<grid_dim, block_dim, 0, stream>>>(output, input_1,
                                                       input_2, n);
    CHECK_LAST_CUDA_ERROR();
}

// Benchmarks three identically-bound kernel launches and prints the mean
// latency of each.
int main()
{
    constexpr size_t num_repeats{10};
    constexpr size_t num_warmups{10};

    constexpr size_t n{1000000};
    constexpr size_t num_test_functions{3};
    cudaStream_t stream;

    float *d_input_1, *d_input_2, *d_output;

    CHECK_CUDA_ERROR(cudaMalloc(&d_input_1, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_input_2, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_output, n * sizeof(float)));
    // Zero-initialize the inputs so the kernel does not read uninitialized
    // device memory (cudaMalloc does not initialize its allocation).
    CHECK_CUDA_ERROR(cudaMemset(d_input_1, 0, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMemset(d_input_2, 0, n * sizeof(float)));

    CHECK_CUDA_ERROR(cudaStreamCreate(&stream));

    std::vector<std::function<void(cudaStream_t)>> test_functions;

    // Bind the device pointers and problem size now; only the stream is
    // supplied at call time via std::placeholders::_1.
    // size_t index avoids the signed/unsigned comparison the previous
    // `int i < num_test_functions` produced.
    for (size_t i{0}; i < num_test_functions; ++i)
    {
        std::function<void(cudaStream_t)> function{
            std::bind(launch_float_addition, d_output, d_input_1, d_input_2,
                      n, std::placeholders::_1)};
        test_functions.push_back(function);
    }

    for (auto const& function : test_functions)
    {
        float const latency{
            measure_performance(function, stream, num_repeats, num_warmups)};
        std::cout << std::fixed << std::setprecision(3)
                  << "Latency: " << latency << " ms" << std::endl;
    }

    CHECK_CUDA_ERROR(cudaFree(d_input_1));
    CHECK_CUDA_ERROR(cudaFree(d_input_2));
    CHECK_CUDA_ERROR(cudaFree(d_output));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
}
1
2
3
4
5
$ nvcc bind.cu -o bind -std=c++14
$ ./bind
Latency: 0.025 ms
Latency: 0.026 ms
Latency: 0.025 ms

Note that I have seen build failures when compiling this program with the nvcc compiler installed on the native host system. However, there is no problem building this program with the nvcc compiler inside the NVIDIA NGC CUDA Docker container.

References

Author

Lei Mao

Posted on

04-07-2022

Updated on

12-15-2023

Licensed under


Comments