Multi-Thread Single-Stream VS Single-Thread Multi-Stream CUDA

Introduction

In CUDA programming, to maximize GPU utilization we often use multiple CUDA streams in the implementation. This raises a question: should we implement the CUDA program in a multi-thread fashion where each thread uses one CUDA stream, or in a single-thread fashion where the thread uses multiple CUDA streams?

In this blog post, I implemented a high-performance addition program and compared the performance between multi-thread single-stream CUDA and single-thread multi-stream CUDA.

High-Performance Addition

In this example, I implemented array addition on the CPU and with CUDA. We can adjust the array size, the number of additions to perform, the number of threads, and the number of CUDA streams per thread, and measure the latency.

All the tests were performed on an x86-64 Ubuntu 20.04 LTS desktop with an Intel i9-9900K CPU and an NVIDIA RTX 2080 Ti GPU.

add.cu
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

#include <cuda_runtime.h>

cudaError_t checkCuda(cudaError_t status)
{
// Note: with this macro guard, the error check is compiled in only when
// NDEBUG or _NDEBUG is defined.
#if defined(NDEBUG) || defined(_NDEBUG)
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(status)
                  << std::endl;
        std::exit(EXIT_FAILURE);
    }
#endif
    return status;
}

__global__ void add_n_kernel(int* ptr, const unsigned int n, const size_t size)
{
    // Grid-stride loop: each CUDA thread increments its elements n times.
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = blockDim.x * gridDim.x;

    for (size_t i = idx; i < size; i += stride)
    {
        for (unsigned int j = 0; j < n; j++)
        {
            ptr[i] += 1;
        }
    }
}

void cpu_add_n(int* ptr, const unsigned int n, const size_t size)
{
    for (size_t i = 0; i < size; i++)
    {
        for (unsigned int j = 0; j < n; j++)
        {
            ptr[i] += 1;
        }
    }
}

void cuda_add_n(int* h_data, int* d_data, const unsigned int n,
                const size_t size, const unsigned int num_streams,
                cudaStream_t* streams)
{
    const size_t block_size{256};
    const size_t stream_size{size / num_streams};
    size_t grid_size = 1;
    if (stream_size / block_size != 0)
    {
        grid_size = stream_size / block_size;
    }
    const size_t stream_bytes{stream_size * sizeof(int)};

    // Each stream gets a contiguous chunk: H2D copy, kernel, D2H copy.
    for (unsigned int i = 0; i < num_streams - 1; i++)
    {
        const size_t offset = i * stream_size;
        checkCuda(cudaMemcpyAsync(d_data + offset, h_data + offset,
                                  stream_bytes, cudaMemcpyHostToDevice,
                                  streams[i]));
        add_n_kernel<<<grid_size, block_size, 0, streams[i]>>>(d_data + offset,
                                                               n, stream_size);
        checkCuda(cudaMemcpyAsync(h_data + offset, d_data + offset,
                                  stream_bytes, cudaMemcpyDeviceToHost,
                                  streams[i]));
    }
    // The last stream also handles the remainder when size is not divisible
    // by num_streams.
    const size_t stream_size_remain = size - (num_streams - 1) * stream_size;
    const size_t stream_bytes_remain = stream_size_remain * sizeof(int);
    const size_t offset = (num_streams - 1) * stream_size;
    checkCuda(cudaMemcpyAsync(d_data + offset, h_data + offset,
                              stream_bytes_remain, cudaMemcpyHostToDevice,
                              streams[num_streams - 1]));
    add_n_kernel<<<grid_size, block_size, 0, streams[num_streams - 1]>>>(
        d_data + offset, n, stream_size_remain);
    checkCuda(cudaMemcpyAsync(h_data + offset, d_data + offset,
                              stream_bytes_remain, cudaMemcpyDeviceToHost,
                              streams[num_streams - 1]));

    return;
}

void thread_add_n(int* h_data, int* d_data, const unsigned int n,
                  const size_t size, const unsigned int num_streams,
                  cudaStream_t* streams)
{
    // CPU add
    if (num_streams == 0)
    {
        cpu_add_n(h_data, n, size);
    }
    // CUDA add
    else
    {
        cuda_add_n(h_data, d_data, n, size, num_streams, streams);
    }
    return;
}

// Multithread add_n
// Each thread uses num_streams_per_thread CUDA streams.
void multithread_add_n(int* h_data, int* d_data, const unsigned int n,
                       const size_t size, const unsigned int num_threads,
                       const unsigned int num_streams_per_thread,
                       const bool verbose, const unsigned int num_tests)
{

    const unsigned int num_streams{num_threads * num_streams_per_thread};

    std::vector<cudaStream_t> streams(num_streams);
    for (unsigned int i = 0; i < streams.size(); i++)
    {
        checkCuda(cudaStreamCreate(&streams.at(i)));
    }

    float duration_total = 0;

    for (unsigned int k = 0; k < num_tests; k++)
    {
        std::vector<std::thread> threads;
        const size_t thread_size{size / num_threads};

        std::chrono::steady_clock::time_point begin =
            std::chrono::steady_clock::now();

        for (unsigned int i = 0; i < num_threads - 1; i++)
        {
            const size_t offset = i * thread_size;
            threads.emplace_back(thread_add_n, h_data + offset,
                                 d_data + offset, n, thread_size,
                                 num_streams_per_thread,
                                 streams.data() + i * num_streams_per_thread);
        }
        const size_t thread_size_remain =
            size - (num_threads - 1) * thread_size;
        const size_t offset = (num_threads - 1) * thread_size;
        threads.emplace_back(thread_add_n, h_data + offset, d_data + offset, n,
                             thread_size_remain, num_streams_per_thread,
                             streams.data() +
                                 (num_threads - 1) * num_streams_per_thread);

        // Join the worker threads first so that all the work has been
        // enqueued, then wait for the streams to drain.
        for (unsigned int i = 0; i < num_threads; i++)
        {
            threads.at(i).join();
        }

        for (unsigned int i = 0; i < num_streams; i++)
        {
            checkCuda(cudaStreamSynchronize(streams.at(i)));
        }

        std::chrono::steady_clock::time_point end =
            std::chrono::steady_clock::now();

        duration_total +=
            std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
                .count();
    }

    for (unsigned int i = 0; i < streams.size(); i++)
    {
        checkCuda(cudaStreamDestroy(streams.at(i)));
    }

    if (verbose)
    {
        std::cout << "Average Latency: " << std::setprecision(2) << std::fixed
                  << duration_total / 1000 / num_tests << " ms" << std::endl;
    }

    return;
}

bool verify_add_n(const std::vector<int>& vector,
                  const std::vector<int>& vector_original,
                  const unsigned int n)
{
    if (vector.size() != vector_original.size())
    {
        return false;
    }
    for (size_t i = 0; i < vector.size(); i++)
    {
        if (vector.at(i) - vector_original.at(i) != n)
        {
            return false;
        }
    }
    return true;
}

int main(int argc, char* argv[])
{
    size_t size{10000000}; // 10 ** 7
    unsigned int n{100};
    unsigned int num_threads{1};
    unsigned int num_streams_per_thread{16};

    if (argc == 5)
    {
        size = atoi(argv[1]);
        n = atoi(argv[2]);
        num_threads = atoi(argv[3]);
        num_streams_per_thread = atoi(argv[4]);
    }

    std::cout << "Array Size: " << size << std::endl;
    std::cout << "Number of Additions: " << n << std::endl;
    std::cout << "Number of Threads: " << num_threads << std::endl;
    std::cout << "Number of Streams Per Thread: " << num_streams_per_thread
              << std::endl;

    // Set CUDA device
    checkCuda(cudaSetDevice(0));

    // Create a vector and initialize it with zeros
    std::vector<int> vector(size, 0);
    std::vector<int> vector_clone{vector};

    int* h_data;
    int* d_data;
    const size_t bytes = size * sizeof(int);

    // Create pinned memory
    checkCuda(cudaMallocHost((void**)&h_data, bytes));
    checkCuda(cudaMalloc((void**)&d_data, bytes));
    checkCuda(cudaMemcpy((void*)h_data, (void*)vector.data(), bytes,
                         cudaMemcpyHostToHost));

    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, false, 1);

    // Copy the result back from the pinned buffer so it can be verified.
    checkCuda(cudaMemcpy((void*)vector.data(), (void*)h_data, bytes,
                         cudaMemcpyHostToHost));

    assert(verify_add_n(vector, vector_clone, n) &&
           "The add_n implementation is incorrect.");

    // Warm up
    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, false, 100);
    // Measure latency
    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, true, 1000);

    checkCuda(cudaFree(d_data));
    checkCuda(cudaFreeHost(h_data));
    // Reserved for cuda-memcheck
    cudaDeviceReset();
}

To build the application, please run the following commands in the terminal.

$ docker run -it --rm --gpus all --privileged -v $(pwd):/mnt -w /mnt nvcr.io/nvidia/cuda:11.4.1-cudnn8-devel-ubuntu20.04
$ cd /mnt/
$ nvcc -o add add.cu -lpthread

The application takes $4$ arguments: the array size, the number of additions to perform, the number of threads, and the number of CUDA streams per thread.

For example, ./add 100 10 8 1 runs the application on an array of size 100, performing the addition 10 times, distributed across 8 threads with 1 CUDA stream per thread.

$ ./add 100 10 8 1
Array Size: 100
Number of Additions: 10
Number of Threads: 8
Number of Streams Per Thread: 1
Average Latency: 0.14 ms

Similarly, ./add 100 10 8 0 runs the application on an array of size 100, performing the addition 10 times, distributed across 8 threads using the CPU only (0 CUDA streams per thread selects the CPU path).

$ ./add 100 10 8 0
Array Size: 100
Number of Additions: 10
Number of Threads: 8
Number of Streams Per Thread: 0
Average Latency: 0.10 ms

Math-Bound VS Memory-Bound

In my previous blog post “Math-Bound VS Memory-Bound Operations”, I discussed math-bound and memory-bound operations. In this particular program, we can make the operation math-bound or memory-bound by adjusting the number of additions.
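
As a rough back-of-the-envelope estimate (assuming the relevant memory cost is the PCIe traffic of 4 bytes in each direction per int element, and ignoring on-device memory accesses), the arithmetic intensity of the operation grows linearly with the number of additions $n$:

$$\text{arithmetic intensity} \approx \frac{n \text{ additions per element}}{8 \text{ bytes transferred per element}} = \frac{n}{8}$$

Small $n$ therefore keeps the operation dominated by the copies (memory-bound), while large $n$ pushes it toward being math-bound.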

From the measured performance, we can see that the GPU is extremely good at math-bound operations, whereas for memory-bound operations it shows no advantage over the CPU, since the latency is dominated by the host-device memory copies.

$ ./add 10000000 100 16 0
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 0
Average Latency: 176.47 ms
$ ./add 10000000 100 16 1
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 10.93 ms
$ ./add 10000000 1 16 0
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 0
Average Latency: 2.90 ms
$ ./add 10000000 1 16 1
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 12.20 ms

In fact, even performing the addition $100$ times does not make the operation math-bound on the GPU. The time spent executing the kernel is only $1.48\%$, whereas the rest of the time was spent on memory copies.

$ nvprof ./add 10000000 100 16 1
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 1
==37708== NVPROF is profiling process 37708, command: ./add 10000000 100 16 1
Average Latency: 18.62 ms
==37708== Profiling application: ./add 10000000 100 16 1
==37708== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 49.61% 488.55ms 1616 302.32us 193.19us 557.79us [CUDA memcpy DtoH]
48.91% 481.69ms 1616 298.08us 197.07us 578.39us [CUDA memcpy HtoD]
1.48% 14.574ms 1616 9.0180us 7.2960us 30.082us add_n_kernel(int*, unsigned int, unsigned long)
API calls: 92.45% 15.6360s 3232 4.8379ms 201.53us 24.250ms cudaMemcpyAsync
6.70% 1.13320s 1616 701.24us 6.5280us 23.786ms cudaLaunchKernel
0.57% 97.205ms 1 97.205ms 97.205ms 97.205ms cudaMalloc
0.21% 35.013ms 1 35.013ms 35.013ms 35.013ms cudaDeviceReset
0.06% 9.6712ms 1616 5.9840us 401ns 716.81us cudaStreamSynchronize
0.00% 467.63us 1 467.63us 467.63us 467.63us cudaFree
0.00% 347.04us 1 347.04us 347.04us 347.04us cuDeviceTotalMem
0.00% 224.06us 101 2.2180us 205ns 100.11us cuDeviceGetAttribute
0.00% 166.07us 32 5.1890us 1.0530us 36.000us cudaStreamDestroy
0.00% 121.61us 32 3.8000us 884ns 68.764us cudaStreamCreate
0.00% 32.067us 1 32.067us 32.067us 32.067us cuDeviceGetName
0.00% 4.7420us 1 4.7420us 4.7420us 4.7420us cuDeviceGetPCIBusId
0.00% 3.6380us 1 3.6380us 3.6380us 3.6380us cudaSetDevice
0.00% 1.6830us 3 561ns 259ns 1.0390us cuDeviceGetCount
0.00% 1.0850us 2 542ns 206ns 879ns cuDeviceGet
0.00% 455ns 1 455ns 455ns 455ns cuDeviceGetUuid
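
As a quick sanity check, the $1.48\%$ figure follows directly from the GPU activity times in the profile above:

$$\frac{14.574\ \text{ms}}{488.55\ \text{ms} + 481.69\ \text{ms} + 14.574\ \text{ms}} \approx \frac{14.574}{984.81} \approx 1.48\%$$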

If we increase the number of additions to $1000000$, which the CPU can hardly handle, the GPU still performs extremely well. The operation has also become math-bound, since the time spent executing the kernel is now $97.43\%$.

$ nvprof ./add 10000000 1000000 16 1
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 16
Number of Streams Per Thread: 1
==49064== NVPROF is profiling process 49064, command: ./add 10000000 1000000 16 1
Average Latency: 263.62 ms
==49064== Profiling application: ./add 10000000 1000000 16 1
==49064== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.43% 25.7634s 1616 15.943ms 14.440ms 26.953ms add_n_kernel(int*, unsigned int, unsigned long)
1.29% 341.34ms 1616 211.23us 197.22us 465.84us [CUDA memcpy HtoD]
1.28% 339.55ms 1616 210.12us 195.46us 467.99us [CUDA memcpy DtoH]
API calls: 94.19% 219.036s 3232 67.771ms 204.40us 331.30ms cudaMemcpyAsync
5.75% 13.3678s 1616 8.2721ms 7.3610us 323.28ms cudaLaunchKernel
0.04% 93.263ms 1 93.263ms 93.263ms 93.263ms cudaMalloc
0.02% 35.650ms 1 35.650ms 35.650ms 35.650ms cudaDeviceReset
0.00% 6.2746ms 1616 3.8820us 399ns 191.04us cudaStreamSynchronize
0.00% 205.59us 1 205.59us 205.59us 205.59us cuDeviceTotalMem
0.00% 121.86us 101 1.2060us 118ns 51.089us cuDeviceGetAttribute
0.00% 114.43us 32 3.5750us 857ns 68.004us cudaStreamCreate
0.00% 112.33us 1 112.33us 112.33us 112.33us cudaFree
0.00% 64.705us 32 2.0220us 1.2410us 10.551us cudaStreamDestroy
0.00% 20.017us 1 20.017us 20.017us 20.017us cuDeviceGetName
0.00% 4.6830us 1 4.6830us 4.6830us 4.6830us cuDeviceGetPCIBusId
0.00% 2.9030us 1 2.9030us 2.9030us 2.9030us cudaSetDevice
0.00% 990ns 3 330ns 165ns 638ns cuDeviceGetCount
0.00% 668ns 2 334ns 133ns 535ns cuDeviceGet
0.00% 205ns 1 205ns 205ns 205ns cuDeviceGetUuid

Multi-Thread Single-Stream VS Single-Thread Multi-Stream

Here we compare the performance of multi-thread single-stream CUDA against single-thread multi-stream CUDA. Concretely, we compared the addition performance for the following two cases (a minimal sketch of the two dispatch patterns is shown after the list):

  • 1 thread and the thread has 16 CUDA streams.
  • 16 threads and each thread has 1 CUDA stream.
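
To make the structural difference concrete, here is a minimal sketch of the two dispatch patterns (not the full benchmark above). It assumes the headers and the 16 created CUDA streams from the program above, and uses a hypothetical helper launch_work(stream, chunk) that enqueues the host-to-device copy, add_n_kernel, and the device-to-host copy for one chunk on the given stream.

// Minimal sketch only. launch_work(stream, chunk) is a hypothetical helper
// that enqueues the H2D copy, add_n_kernel, and D2H copy for one chunk.

// Case 1: 1 thread, 16 streams. The main thread enqueues work on every stream.
for (unsigned int i = 0; i < 16; i++)
{
    launch_work(streams[i], i);
}
for (unsigned int i = 0; i < 16; i++)
{
    cudaStreamSynchronize(streams[i]);
}

// Case 2: 16 threads, 1 stream each. Every thread enqueues work on its own stream.
std::vector<std::thread> threads;
for (unsigned int i = 0; i < 16; i++)
{
    threads.emplace_back([&streams, i]() {
        launch_work(streams[i], i);
        cudaStreamSynchronize(streams[i]);
    });
}
for (auto& t : threads)
{
    t.join();
}

In both cases the same work is enqueued on the same 16 streams; the only difference is whether one host thread or sixteen host threads issue the CUDA calls.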

From the latencies we measured, there is no significant performance difference between the two cases for the math-bound operation. For the memory-bound operations the differences are also small: the single-thread multi-stream implementation is slightly faster on the smaller array and slightly slower on the larger one.

$ ./add 100000000 1 16 1
Array Size: 100000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 64.67 ms
$ ./add 100000000 1 1 16
Array Size: 100000000
Number of Additions: 1
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 70.82 ms
$ ./add 10000000 1 16 1
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 10.94 ms
$ ./add 10000000 1 1 16
Array Size: 10000000
Number of Additions: 1
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 9.08 ms
$ ./add 10000000 1000000 16 1
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 242.83 ms
$ ./add 10000000 1000000 1 16
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 250.37 ms

Summary

All the experiment results are summarized below.

Array Size    Number of Additions    Number of Threads    Number of Streams Per Thread    Average Latency (ms)
10000000      1                      16                   0                               2.90
10000000      1                      16                   1                               12.20
10000000      1                      1                    16                              9.08
10000000      100                    16                   0                               176.47
10000000      100                    16                   1                               10.93
10000000      1000000                16                   1                               242.83
10000000      1000000                1                    16                              250.37
100000000     1                      16                   1                               64.67
100000000     1                      1                    16                              70.82

Conclusion

The latency difference between multi-thread single-stream CUDA and single-thread multi-stream CUDA is small.

Author

Lei Mao

Posted on

10-18-2021

Updated on

05-12-2022
