Function Binding and Performance Measurement

Introduction

Recently, I realized that I had not been writing clean and elegant code to measure the performance of functions in my previous articles.

In this blog post, I would like to quickly show how to implement reusable performance measurement code in C++, CUDA, and Python.

Performance Measurement

In the following examples, I created equivalent implementations of the reusable performance measurement in C++, CUDA, and Python, respectively.

C++

The key is to use std::function and std::bind to bind functions and input parameters before performance measurement.

bind.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

// Measure the average latency, in milliseconds, of a zero-argument callable.
//
// bound_function: callable with all inputs already bound (e.g. via std::bind).
// num_repeats:    number of timed invocations to average over.
// num_warmups:    number of untimed invocations run first to reach steady state.
// Returns:        average latency per invocation in milliseconds.
template <class T>
float measure_performance(std::function<T(void)> bound_function,
                          const int num_repeats = 100,
                          const int num_warmups = 100)
{
    // Warm-up runs are not timed.
    for (int i{0}; i < num_warmups; ++i)
    {
        bound_function();
    }

    std::chrono::steady_clock::time_point const time_start{
        std::chrono::steady_clock::now()};
    for (int i{0}; i < num_repeats; ++i)
    {
        bound_function();
    }
    std::chrono::steady_clock::time_point const time_end{
        std::chrono::steady_clock::now()};

    // Convert to a floating-point millisecond duration directly.
    // duration_cast<std::chrono::milliseconds> (the previous approach)
    // truncates to whole milliseconds BEFORE dividing by num_repeats,
    // which throws away all sub-millisecond resolution.
    float const time_elapsed{
        std::chrono::duration<float, std::milli>(time_end - time_start)
            .count()};
    float const latency{time_elapsed / static_cast<float>(num_repeats)};

    return latency;
}

// Dummy workload for the benchmark: sleeps ~10 ms so the measured latency
// has a known lower bound, then returns the sum of its arguments.
double test_function(double a, double const& b)
{
    using namespace std::chrono_literals;
    std::this_thread::sleep_for(10ms);
    return a + b;
}

int main()
{
    constexpr int num_repeats{10};
    constexpr int num_warmups{10};

    // Bind each (a, b) input pair to test_function up front, so the
    // measurement loop only has to deal with zero-argument callables.
    std::vector<std::tuple<double, double>> const test_function_inputs{
        {1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}};

    std::vector<std::function<double(void)>> test_functions;
    for (auto const& inputs : test_function_inputs)
    {
        test_functions.emplace_back(std::bind(
            test_function, std::get<0>(inputs), std::get<1>(inputs)));
    }

    for (auto const& bound : test_functions)
    {
        float const latency{
            measure_performance(bound, num_repeats, num_warmups)};
        std::cout << std::fixed << std::setprecision(3)
                  << "Latency: " << latency << " ms" << std::endl;
    }
}
1
2
3
4
5
$ g++ bind.cpp -o bind --std=c++14
$ ./bind
Latency: 10.100 ms
Latency: 10.100 ms
Latency: 10.100 ms

Python

Similar to the C++ version, the key is to use functools.partial to bind functions and input parameters before performance measurement.

bind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import time
import functools
from timeit import default_timer as timer
from datetime import timedelta


def measure_performance(bound_function: functools.partial,
                        num_repeats: int = 100,
                        num_warmups: int = 100) -> float:
    """Measure the average latency of a zero-argument callable.

    Args:
        bound_function: Callable with all inputs already bound
            (e.g. via ``functools.partial``).
        num_repeats: Number of timed invocations to average over.
        num_warmups: Number of untimed invocations run first so the
            timed loop measures steady-state behavior.

    Returns:
        Average latency per invocation, in milliseconds.
    """
    for _ in range(num_warmups):
        bound_function()

    time_start = timer()
    for _ in range(num_repeats):
        bound_function()
    time_end = timer()

    # default_timer returns seconds; convert directly to milliseconds.
    # (The original round-trip through timedelta(...).total_seconds()
    # was redundant.)
    time_elapsed = (time_end - time_start) * 1000.0

    return time_elapsed / num_repeats


def test_function(a: float, b: float) -> float:

time.sleep(10 / 1000)

return a + b


if __name__ == "__main__":

    num_repeats = 10
    num_warmups = 10

    # Bind each input pair ahead of time so the measurement loop only
    # handles zero-argument callables.
    test_function_inputs = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)]
    test_functions = [
        functools.partial(test_function, a, b)
        for a, b in test_function_inputs
    ]

    for function in test_functions:
        latency = measure_performance(bound_function=function,
                                      num_repeats=num_repeats,
                                      num_warmups=num_warmups)
        print(f"Latency: {latency:.3f} ms")
1
2
3
4
$ python bind.py 
Latency: 10.081 ms
Latency: 10.070 ms
Latency: 10.090 ms

CUDA

Similar to the C++ example, we also use std::function and std::bind to bind functions and input parameters before performance measurement. However, to measure the latency using CUDA events, we need to know which CUDA stream the kernel is launched on and synchronize on that stream accordingly.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#include <chrono>
#include <functional>
#include <iomanip>
#include <iostream>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include <cuda_runtime.h>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
// Abort with a diagnostic if a CUDA runtime call returned an error.
// The macro captures the call-site expression, file, and line.
template <typename T>
void check(T err, const char* const func, const char* const file,
           const int line)
{
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
    std::exit(EXIT_FAILURE);
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
// Abort with a diagnostic if the most recent CUDA call (e.g. a kernel
// launch) left a pending error.
void checkLast(const char* const file, const int line)
{
    cudaError_t const err{cudaGetLastError()};
    if (err == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Runtime Error at: " << file << ":" << line
              << std::endl;
    std::cerr << cudaGetErrorString(err) << std::endl;
    std::exit(EXIT_FAILURE);
}

// Measure the average latency (in ms) of a stream-bound callable using
// CUDA events. The callable takes the stream so that all work it enqueues
// is recorded on — and synchronized against — that same stream.
template <class T>
float measure_performance(std::function<T(cudaStream_t)> bound_function,
cudaStream_t stream, const int num_repeats = 100,
const int num_warmups = 100)
{
cudaEvent_t start, stop;
// Total elapsed time in ms; filled in by cudaEventElapsedTime below.
float time;

CHECK_CUDA_ERROR(cudaEventCreate(&start));
CHECK_CUDA_ERROR(cudaEventCreate(&stop));

// Warm-up runs; not timed.
for (int i{0}; i < num_warmups; ++i)
{
bound_function(stream);
}

// Drain all warm-up work before recording the start event, so it cannot
// leak into the timed region.
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));

CHECK_CUDA_ERROR(cudaEventRecord(start, stream));
for (int i{0}; i < num_repeats; ++i)
{
bound_function(stream);
}
CHECK_CUDA_ERROR(cudaEventRecord(stop, stream));
// Block the host until the stop event — and hence all timed work on the
// stream — has completed.
CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
CHECK_LAST_CUDA_ERROR();
// cudaEventElapsedTime reports milliseconds between the two events.
CHECK_CUDA_ERROR(cudaEventElapsedTime(&time, start, stop));
CHECK_CUDA_ERROR(cudaEventDestroy(start));
CHECK_CUDA_ERROR(cudaEventDestroy(stop));

float latency{time / num_repeats};

return latency;
}

// Elementwise addition kernel: output[i] = input_1[i] + input_2[i] for
// i in [0, n). Uses a grid-stride loop, so any grid/block configuration
// covers all n elements.
__global__ void float_addition(float* output, float const* input_1,
float const* input_2, uint32_t const n)
{
const uint32_t idx{blockDim.x * blockIdx.x + threadIdx.x};
const uint32_t stride{blockDim.x * gridDim.x};
for (uint32_t i{idx}; i < n; i += stride)
{
output[i] = input_1[i] + input_2[i];
}
}

// Host-side launcher: enqueues float_addition on the given stream with a
// fixed configuration of 32 blocks x 1024 threads. The launch is
// asynchronous; the caller is responsible for synchronizing the stream.
void launch_float_addition(float* output, float const* input_1,
float const* input_2, uint32_t const n,
cudaStream_t stream)
{
dim3 threads_per_block{1024};
dim3 blocks_per_grid{32};
float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>(
output, input_1, input_2, n);
}

int main()
{
constexpr int num_repeats{10};
constexpr int num_warmups{10};

constexpr int n{1000000};
constexpr int num_test_functions{3};
cudaStream_t stream;

float *d_input_1, *d_input_2, *d_output;

CHECK_CUDA_ERROR(cudaMalloc(&d_input_1, n * sizeof(float)));
CHECK_CUDA_ERROR(cudaMalloc(&d_input_2, n * sizeof(float)));
CHECK_CUDA_ERROR(cudaMalloc(&d_output, n * sizeof(float)));

CHECK_CUDA_ERROR(cudaStreamCreate(&stream));

std::vector<std::function<void(cudaStream_t)>> test_functions;

for (int i{0}; i < num_test_functions; ++i)
{
std::function<void(cudaStream_t)> function{
std::bind(launch_float_addition, d_input_1, d_input_2, d_output, n,
std::placeholders::_1)};
test_functions.push_back(function);
}

for (auto const& function : test_functions)
{
float latency{
measure_performance(function, stream, num_repeats, num_warmups)};
std::cout << std::fixed << std::setprecision(3)
<< "Latency: " << latency << " ms" << std::endl;
}

CHECK_CUDA_ERROR(cudaStreamDestroy(stream));
}
1
2
3
4
5
$ nvcc bind.cu -o bind -std=c++14
$ ./bind
Latency: 0.025 ms
Latency: 0.026 ms
Latency: 0.025 ms

References

Author

Lei Mao

Posted on

04-07-2022

Updated on

05-12-2022

Licensed under


Comments