CUDA Default Stream

Introduction

The CUDA default stream can have different synchronization behaviors depending on how it is configured. Sometimes, it even helps the program run correctly despite mistakes in assigning CUDA streams to kernels.

In this blog post, I would like to introduce the two types of CUDA default streams, the legacy default stream and the per-thread default stream, and discuss their synchronization behaviors in different scenarios.

Default Stream and Non-Default Blocking Stream

In the following example, I created a non-default blocking stream using cudaStreamCreate. For a series of CUDA kernels that are supposed to run in sequence on the same non-default blocking CUDA stream, I made a mistake and accidentally used the default stream for one of the kernels.

If the default stream is the legacy default stream, when an action such as a kernel launch or cudaStreamWaitEvent() is taken in it, the legacy stream first waits on all blocking streams, the action is queued in the legacy stream, and then all blocking streams wait on the legacy stream. Therefore, even though I made a mistake, the CUDA kernels still run in sequence and the correctness of the application is not affected.

If the default stream is the per-thread default stream, it is non-blocking and will not synchronize with other CUDA streams. Therefore, my mistake will cause the application to run incorrectly.

add.cu
#include <cassert>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
void check(cudaError_t err, const char* const func, const char* const file,
           const int line)
{
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(const char* const file, const int line)
{
    cudaError_t const err{cudaGetLastError()};
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

__global__ void add_val_in_place(int32_t* data, int32_t val, uint32_t n)
{
    uint32_t const idx{blockDim.x * blockIdx.x + threadIdx.x};
    uint32_t const stride{blockDim.x * gridDim.x};
    for (uint32_t i{idx}; i < n; i += stride)
    {
        data[i] += val;
    }
}

void launch_add_val_in_place(int32_t* data, int32_t val, uint32_t n,
                             cudaStream_t stream)
{
    dim3 const threads_per_block{1024};
    dim3 const blocks_per_grid{32};
    add_val_in_place<<<blocks_per_grid, threads_per_block, 0, stream>>>(data,
                                                                        val, n);
    CHECK_LAST_CUDA_ERROR();
}

bool check_array_value(int32_t const* data, uint32_t n, int32_t val)
{
    for (uint32_t i{0}; i < n; ++i)
    {
        if (data[i] != val)
        {
            return false;
        }
    }
    return true;
}

int main()
{
    constexpr uint32_t const n{1000000};
    constexpr int32_t const val_1{1};
    constexpr int32_t const val_2{2};
    constexpr int32_t const val_3{3};
    // Create a multi-stream application.
    cudaStream_t stream_1{0};
    cudaStream_t stream_2{0};
    // stream_1 is a non-default blocking stream.
    CHECK_CUDA_ERROR(cudaStreamCreate(&stream_1));

    std::vector<int32_t> vec(n, 0);
    int32_t* d_data{nullptr};
    CHECK_CUDA_ERROR(cudaMalloc(&d_data, n * sizeof(int32_t)));
    CHECK_CUDA_ERROR(cudaMemcpy(d_data, vec.data(), n * sizeof(int32_t),
                                cudaMemcpyHostToDevice));
    // Run a sequence of CUDA kernels in order on the same CUDA stream.
    launch_add_val_in_place(d_data, val_1, n, stream_1);
    // The second kernel launch is supposed to be run on stream_1.
    // However, the implementation has a typo such that the kernel launch
    // is run on the default stream_2.
    launch_add_val_in_place(d_data, val_2, n, stream_2);
    launch_add_val_in_place(d_data, val_3, n, stream_1);

    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_1));
    CHECK_CUDA_ERROR(cudaMemcpy(vec.data(), d_data, n * sizeof(int32_t),
                                cudaMemcpyDeviceToHost));

    // Check the correctness of the application.
    // The result will still be correct if the default stream_2
    // is the legacy default stream.
    assert(check_array_value(vec.data(), n, val_1 + val_2 + val_3));

    CHECK_CUDA_ERROR(cudaFree(d_data));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream_1));
}

We made a mistake in the implementation such that the three kernels are not run on the same CUDA stream, yet the result is still correct.

$ nvcc add.cu -o add -std=c++14
$ ./add

This is the same as running the following commands, because the default value for --default-stream is legacy.

$ nvcc add.cu -o add -std=c++14 --default-stream=legacy
$ ./add

Depending on the use case, this kind of mistake may sometimes affect application performance. It can usually be identified using CUDA profiling software, such as Nsight Systems.
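
For example, assuming Nsight Systems is installed, a timeline that exposes the unintended default-stream launch can be captured with the following command.

$ nsys profile --stats=true ./add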

However, if the default stream is the per-thread default stream, the result is no longer correct, because the kernel launches are no longer serialized.

$ nvcc add.cu -o add -std=c++14 --default-stream=per-thread
$ ./add
add: add.cu:98: int main(): Assertion `check_array_value(vec.data(), n, val_1 + val_2 + val_3)' failed.
Aborted (core dumped)
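
As a side note, the choice is not only a compile-time one. Independently of the --default-stream nvcc option, the CUDA runtime also defines the special stream handles cudaStreamLegacy and cudaStreamPerThread, which request the legacy or the per-thread default-stream semantics for an individual kernel launch or API call. The following is a minimal sketch of their usage; the file name handles.cu and the noop kernel are just for illustration.

handles.cu
#include <cuda_runtime.h>

__global__ void noop() {}

int main()
{
    // This launch uses legacy default-stream semantics and synchronizes
    // with all blocking streams.
    noop<<<1, 1, 0, cudaStreamLegacy>>>();
    // This launch uses per-thread default-stream semantics and does not
    // implicitly synchronize with other streams.
    noop<<<1, 1, 0, cudaStreamPerThread>>>();
    cudaDeviceSynchronize();
}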

Default Stream and Non-Default Non-Blocking Stream

In some applications, the non-default stream may be created using cudaStreamCreateWithFlags with the cudaStreamNonBlocking flag, which makes it non-blocking. In this case, the default stream, even if it is the legacy default stream, does not synchronize with the non-default non-blocking stream. Therefore, my mistake will cause the application to run incorrectly, regardless of whether the default stream is legacy or per-thread.

add.cu
#include <cassert>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
void check(cudaError_t err, const char* const func, const char* const file,
           const int line)
{
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
void checkLast(const char* const file, const int line)
{
    cudaError_t const err{cudaGetLastError()};
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error at: " << file << ":" << line
                  << std::endl;
        std::cerr << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

__global__ void add_val_in_place(int32_t* data, int32_t val, uint32_t n)
{
    uint32_t const idx{blockDim.x * blockIdx.x + threadIdx.x};
    uint32_t const stride{blockDim.x * gridDim.x};
    for (uint32_t i{idx}; i < n; i += stride)
    {
        data[i] += val;
    }
}

void launch_add_val_in_place(int32_t* data, int32_t val, uint32_t n,
                             cudaStream_t stream)
{
    dim3 const threads_per_block{1024};
    dim3 const blocks_per_grid{32};
    add_val_in_place<<<blocks_per_grid, threads_per_block, 0, stream>>>(data,
                                                                        val, n);
    CHECK_LAST_CUDA_ERROR();
}

bool check_array_value(int32_t const* data, uint32_t n, int32_t val)
{
    for (uint32_t i{0}; i < n; ++i)
    {
        if (data[i] != val)
        {
            return false;
        }
    }
    return true;
}

int main()
{
    constexpr uint32_t const n{1000000};
    constexpr int32_t const val_1{1};
    constexpr int32_t const val_2{2};
    constexpr int32_t const val_3{3};
    // Create a multi-stream application.
    cudaStream_t stream_1{0};
    cudaStream_t stream_2{0};
    // stream_1 is a non-default non-blocking stream.
    CHECK_CUDA_ERROR(
        cudaStreamCreateWithFlags(&stream_1, cudaStreamNonBlocking));

    std::vector<int32_t> vec(n, 0);
    int32_t* d_data{nullptr};
    CHECK_CUDA_ERROR(cudaMalloc(&d_data, n * sizeof(int32_t)));
    CHECK_CUDA_ERROR(cudaMemcpy(d_data, vec.data(), n * sizeof(int32_t),
                                cudaMemcpyHostToDevice));
    // Run a sequence of CUDA kernels in order on the same CUDA stream.
    launch_add_val_in_place(d_data, val_1, n, stream_1);
    // The second kernel launch is supposed to be run on stream_1.
    // However, the implementation has a typo so that the kernel launch
    // is run on the default stream_2.
    launch_add_val_in_place(d_data, val_2, n, stream_2);
    launch_add_val_in_place(d_data, val_3, n, stream_1);

    CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_1));
    CHECK_CUDA_ERROR(cudaMemcpy(vec.data(), d_data, n * sizeof(int32_t),
                                cudaMemcpyDeviceToHost));

    // Check the correctness of the application.
    // The result will be incorrect regardless of whether the default
    // stream_2 is the legacy or the per-thread default stream.
    assert(check_array_value(vec.data(), n, val_1 + val_2 + val_3));

    CHECK_CUDA_ERROR(cudaFree(d_data));
    CHECK_CUDA_ERROR(cudaStreamDestroy(stream_1));
}
$ nvcc add.cu -o add -std=c++14 --default-stream=legacy
$ ./add
add: add.cu:98: int main(): Assertion `check_array_value(vec.data(), n, val_1 + val_2 + val_3)' failed.
Aborted (core dumped)
$ nvcc add.cu -o add -std=c++14 --default-stream=per-thread
$ ./add
add: add.cu:98: int main(): Assertion `check_array_value(vec.data(), n, val_1 + val_2 + val_3)' failed.
Aborted (core dumped)
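
Of course, the real fix is simply to launch all three kernels on the same stream. But when cross-stream ordering is genuinely required and non-blocking streams are involved, the dependency has to be expressed explicitly, for example using CUDA events. The following is a minimal sketch of that pattern under the same setup as above; it replaces the second and third kernel launches in main.

cudaEvent_t event;
CHECK_CUDA_ERROR(cudaEventCreate(&event));
// Make the default stream_2 wait for the work queued so far on stream_1.
CHECK_CUDA_ERROR(cudaEventRecord(event, stream_1));
CHECK_CUDA_ERROR(cudaStreamWaitEvent(stream_2, event, 0));
launch_add_val_in_place(d_data, val_2, n, stream_2);
// Make stream_1 wait for the kernel launched on stream_2.
CHECK_CUDA_ERROR(cudaEventRecord(event, stream_2));
CHECK_CUDA_ERROR(cudaStreamWaitEvent(stream_1, event, 0));
launch_add_val_in_place(d_data, val_3, n, stream_1);
// Destroy the event after cudaStreamSynchronize(stream_1).

With the explicit event dependencies, the three kernels run in sequence and the assertion passes regardless of the default stream type and the blocking flags of the non-default streams.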

Author: Lei Mao
Posted on: 11-06-2023
Updated on: 11-06-2023