
Introduction

In CUDA programming, to achieve the maximum utilization of the GPU, we often use multiple CUDA streams in the implementation. This raises a question: should we implement the CUDA program in a multi-thread fashion, where each thread uses one CUDA stream, or in a single-thread fashion, where the single thread uses multiple CUDA streams?
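
To make the question concrete, below is a minimal sketch of the two fashions. The work kernel is only a placeholder and error checking is omitted for brevity; the full benchmark program follows later in this post.

// sketch.cu
#include <thread>
#include <vector>

#include <cuda_runtime.h>

__global__ void work() {}

// Single-thread multi-stream: one host thread launches work onto
// every stream.
void single_thread_multi_stream(const std::vector<cudaStream_t>& streams)
{
    for (cudaStream_t stream : streams)
    {
        work<<<1, 256, 0, stream>>>();
    }
}

// Multi-thread single-stream: each host thread owns one stream and
// launches work onto it.
void multi_thread_single_stream(const std::vector<cudaStream_t>& streams)
{
    std::vector<std::thread> threads;
    for (cudaStream_t stream : streams)
    {
        threads.emplace_back([stream] { work<<<1, 256, 0, stream>>>(); });
    }
    for (std::thread& thread : threads)
    {
        thread.join();
    }
}

int main()
{
    std::vector<cudaStream_t> streams(4);
    for (cudaStream_t& stream : streams)
    {
        cudaStreamCreate(&stream);
    }
    single_thread_multi_stream(streams);
    multi_thread_single_stream(streams);
    cudaDeviceSynchronize();
    for (cudaStream_t& stream : streams)
    {
        cudaStreamDestroy(stream);
    }
}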


In this blog post, I implemented a high-performance addition program and compared the performance between multi-thread single-stream CUDA and single-thread multi-stream CUDA.

High-Performance Addition

In this example, I implemented the array addition using both the CPU and CUDA. We could adjust the array size, the number of additions to perform, the number of threads, and the number of CUDA streams per thread, and measure the resulting latency.


All the tests were performed on an x86-64 Ubuntu 20.04 LTS desktop with an Intel i9-9900K CPU and an NVIDIA RTX 2080 Ti GPU.

// add.cu
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

#include <cuda_runtime.h>

cudaError_t checkCuda(cudaError_t status)
{
    // Error checking is compiled out of release builds (-DNDEBUG).
#if !defined(NDEBUG)
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(status)
                  << std::endl;
        std::exit(EXIT_FAILURE);
    }
#endif
    return status;
}

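// Increment each of the size elements n times. The grid-stride loop lets a
// fixed grid of threads cover an array of any length.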
__global__ void add_n_kernel(int* ptr, const unsigned int n, const size_t size)
{
    const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = blockDim.x * gridDim.x;

    for (size_t i = idx; i < size; i += stride)
    {
        for (unsigned int j = 0; j < n; j++)
        {
            ptr[i] += 1;
        }
    }
}

void cpu_add_n(int* ptr, const unsigned int n, const size_t size)
{
    for (size_t i = 0; i < size; i++)
    {
        for (unsigned int j = 0; j < n; j++)
        {
            ptr[i] += 1;
        }
    }
}

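// Partition the array into num_streams chunks. Each stream asynchronously
// copies its chunk to the device, runs the kernel on it, and copies it back,
// so that transfers and kernels issued on different streams may overlap.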
void cuda_add_n(int* h_data, int* d_data, const unsigned int n,
                const size_t size, const unsigned int num_streams,
                cudaStream_t* streams)
{
    const size_t block_size{256};
    const size_t stream_size{size / num_streams};
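    // Use at least one block; any remainder beyond a whole number of blocks
    // is covered by the kernel's grid-stride loop.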
    size_t grid_size = 1;
    if (stream_size >= block_size)
    {
        grid_size = stream_size / block_size;
    }
    const size_t stream_bytes{stream_size * sizeof(int)};

    for (unsigned int i = 0; i < num_streams - 1; i++)
    {
        const size_t offset = i * stream_size;
        checkCuda(cudaMemcpyAsync(d_data + offset, h_data + offset,
                                  stream_bytes, cudaMemcpyHostToDevice,
                                  streams[i]));
        add_n_kernel<<<grid_size, block_size, 0, streams[i]>>>(d_data + offset,
                                                               n, stream_size);
        checkCuda(cudaMemcpyAsync(h_data + offset, d_data + offset,
                                  stream_bytes, cudaMemcpyDeviceToHost,
                                  streams[i]));
    }
    const size_t stream_size_remain = size - (num_streams - 1) * stream_size;
    const size_t stream_bytes_remain = stream_size_remain * sizeof(int);
    const size_t offset = (num_streams - 1) * stream_size;
    checkCuda(cudaMemcpyAsync(d_data + offset, h_data + offset,
                              stream_bytes_remain, cudaMemcpyHostToDevice,
                              streams[num_streams - 1]));
    add_n_kernel<<<grid_size, block_size, 0, streams[num_streams - 1]>>>(
        d_data + offset, n, stream_size_remain);
    checkCuda(cudaMemcpyAsync(h_data + offset, d_data + offset,
                              stream_bytes_remain, cudaMemcpyDeviceToHost,
                              streams[num_streams - 1]));

    return;
}

void thread_add_n(int* h_data, int* d_data, const unsigned int n,
                  const size_t size, const unsigned int num_streams,
                  cudaStream_t* streams)
{
    // CPU add
    if (num_streams == 0)
    {
        cpu_add_n(h_data, n, size);
    }
    // CUDA add
    else
    {
        cuda_add_n(h_data, d_data, n, size, num_streams, streams);
    }
    return;
}

// Multithread add_n
// Each thread uses num_streams_per_thread streams
void multithread_add_n(int* h_data, int* d_data, const unsigned int n,
                       const size_t size, const unsigned int num_threads,
                       const unsigned int num_streams_per_thread,
                       const bool verbose, const unsigned int num_tests)
{

    const unsigned int num_streams{num_threads * num_streams_per_thread};

    std::vector<cudaStream_t> streams(num_streams);
    for (unsigned int i = 0; i < streams.size(); i++)
    {
        checkCuda(cudaStreamCreate(&streams.at(i)));
    }

    float duration_total = 0;

    for (unsigned int k = 0; k < num_tests; k++)
    {
        std::vector<std::thread> threads;
        const size_t thread_size{size / num_threads};

        std::chrono::steady_clock::time_point begin =
            std::chrono::steady_clock::now();

        for (unsigned int i = 0; i < num_threads - 1; i++)
        {
            const size_t offset = i * thread_size;
            threads.emplace_back(thread_add_n, h_data + offset, d_data + offset,
                                 n, thread_size, num_streams_per_thread,
                                 streams.data() + i * num_streams_per_thread);
        }
        const size_t thread_size_remain =
            size - (num_threads - 1) * thread_size;
        const size_t offset = (num_threads - 1) * thread_size;
        threads.emplace_back(thread_add_n, h_data + offset, d_data + offset, n,
                             thread_size_remain, num_streams_per_thread,
                             streams.data() +
                                 (num_threads - 1) * num_streams_per_thread);

        // Join the worker threads first so that all the asynchronous work
        // has been issued before the streams are synchronized.
        for (unsigned int i = 0; i < num_threads; i++)
        {
            threads.at(i).join();
        }

        for (unsigned int i = 0; i < num_streams; i++)
        {
            checkCuda(cudaStreamSynchronize(streams.at(i)));
        }

        std::chrono::steady_clock::time_point end =
            std::chrono::steady_clock::now();

        duration_total +=
            std::chrono::duration_cast<std::chrono::microseconds>(end - begin)
                .count();
    }

    for (unsigned int i = 0; i < streams.size(); i++)
    {
        checkCuda(cudaStreamDestroy(streams.at(i)));
    }

    if (verbose)
    {
        std::cout << "Average Latency: " << std::setprecision(2) << std::fixed
                  << duration_total / 1000 / num_tests << " ms"
                  << std::endl;
    }

    return;
}

bool verify_add_n(const std::vector<int>& vector,
                  const std::vector<int>& vector_original, const unsigned int n)
{
    if (vector.size() != vector_original.size())
    {
        return false;
    }
    for (size_t i = 0; i < vector.size(); i++)
    {
        if (vector.at(i) - vector_original.at(i) != n)
        {
            return false;
        }
    }
    return true;
}

int main(int argc, char* argv[])
{
    size_t size{10000000}; // 10^7
    unsigned int n{100};
    unsigned int num_threads{1};
    unsigned int num_streams_per_thread{16};

    if (argc == 5)
    {
        size = strtoull(argv[1], nullptr, 10);
        n = atoi(argv[2]);
        num_threads = atoi(argv[3]);
        num_streams_per_thread = atoi(argv[4]);
    }

    std::cout << "Array Size: " << size << std::endl;
    std::cout << "Number of Additions: " << n << std::endl;
    std::cout << "Number of Threads: " << num_threads << std::endl;
    std::cout << "Number of Streams Per Thread: " << num_streams_per_thread
              << std::endl;

    // Set CUDA device
    checkCuda(cudaSetDevice(0));

    // Create a vector and initialize it with zeros
    std::vector<int> vector(size, 0);
    std::vector<int> vector_clone{vector};

    int* h_data = vector.data();
    int* d_data;
    const size_t bytes = size * sizeof(int);
    checkCuda(cudaMalloc((void**)&d_data, bytes));

    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, false, 1);

    assert(verify_add_n(vector, vector_clone, n) &&
           "The add_n implementation is incorrect.");
    
    // Warm up
    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, false, 100);
    // Measure latency
    multithread_add_n(h_data, d_data, n, size, num_threads,
                      num_streams_per_thread, true, 1000);

    checkCuda(cudaFree(d_data));
    // Reserved for cuda-memcheck
    cudaDeviceReset();
}

To build and run the application, please run the following commands in the terminal.

$ docker run -it --rm --gpus all --privileged -v $(pwd):/mnt nvcr.io/nvidia/cuda:11.4.1-cudnn8-devel-ubuntu20.04
$ cd /mnt/
$ nvcc -o add add.cu -lpthread

The application takes $4$ arguments: the array size, the number of additions to perform, the number of threads, and the number of CUDA streams per thread.


For example, ./add 100 10 8 1 means running the application on an array of size 100, performing the addition 10 times, distributed across 8 threads, with each thread using 1 CUDA stream.

$ ./add 100 10 8 1
Array Size: 100
Number of Additions: 10
Number of Threads: 8
Number of Streams Per Thread: 1
Average Latency: 0.14 ms

Similarly, ./add 100 10 8 0 means running the application on an array of size 100, performing the addition 10 times, distributed across 8 threads using the CPU only, since 0 CUDA streams per thread selects the CPU implementation.

$ ./add 100 10 8 0
Array Size: 100
Number of Additions: 10
Number of Threads: 8
Number of Streams Per Thread: 0
Average Latency: 0.10 ms

Math-Bound VS Memory-Bound

In my previous blog post “Math-Bound VS Memory-Bound Operations”, we discussed math-bound and memory-bound operations. In this particular program, we could make the operation math-bound or memory-bound by adjusting the number of additions.
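
As a rough back-of-the-envelope estimate (an illustration only, counting one operation per integer increment and only the PCIe traffic), each array element is transferred twice over PCIe ($4$ bytes in each direction) and incremented $n$ times, so the arithmetic intensity of the operation is

$$\frac{n \text{ additions}}{2 \times 4 \text{ bytes}} = \frac{n}{8} \text{ additions per byte}$$

A small $n$ therefore leaves the program dominated by memory copies, whereas a large $n$ pushes it toward being math-bound.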


From the measured performance, we could see that the GPU is extremely good at performing math-bound operations, whereas for memory-bound operations the GPU did not show a significant advantage.

$ ./add 10000000 100 16 0
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 0
Average Latency: 176.47 ms
$ ./add 10000000 100 16 1
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 10.93 ms
$ ./add 10000000 1 16 0
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 0
Average Latency: 2.90 ms
$ ./add 10000000 1 16 1
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 12.20 ms

In fact, even performing the addition $100$ times does not make the operation math-bound on the GPU. The time spent on executing the kernel is only $1.48\%$, whereas the rest of the time was spent on memory copies.

$ nvprof ./add 10000000 100 16 1
Array Size: 10000000
Number of Additions: 100
Number of Threads: 16
Number of Streams Per Thread: 1
==37708== NVPROF is profiling process 37708, command: ./add 10000000 100 16 1
Average Latency: 18.62 ms
==37708== Profiling application: ./add 10000000 100 16 1
==37708== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   49.61%  488.55ms      1616  302.32us  193.19us  557.79us  [CUDA memcpy DtoH]
                   48.91%  481.69ms      1616  298.08us  197.07us  578.39us  [CUDA memcpy HtoD]
                    1.48%  14.574ms      1616  9.0180us  7.2960us  30.082us  add_n_kernel(int*, unsigned int, unsigned long)
      API calls:   92.45%  15.6360s      3232  4.8379ms  201.53us  24.250ms  cudaMemcpyAsync
                    6.70%  1.13320s      1616  701.24us  6.5280us  23.786ms  cudaLaunchKernel
                    0.57%  97.205ms         1  97.205ms  97.205ms  97.205ms  cudaMalloc
                    0.21%  35.013ms         1  35.013ms  35.013ms  35.013ms  cudaDeviceReset
                    0.06%  9.6712ms      1616  5.9840us     401ns  716.81us  cudaStreamSynchronize
                    0.00%  467.63us         1  467.63us  467.63us  467.63us  cudaFree
                    0.00%  347.04us         1  347.04us  347.04us  347.04us  cuDeviceTotalMem
                    0.00%  224.06us       101  2.2180us     205ns  100.11us  cuDeviceGetAttribute
                    0.00%  166.07us        32  5.1890us  1.0530us  36.000us  cudaStreamDestroy
                    0.00%  121.61us        32  3.8000us     884ns  68.764us  cudaStreamCreate
                    0.00%  32.067us         1  32.067us  32.067us  32.067us  cuDeviceGetName
                    0.00%  4.7420us         1  4.7420us  4.7420us  4.7420us  cuDeviceGetPCIBusId
                    0.00%  3.6380us         1  3.6380us  3.6380us  3.6380us  cudaSetDevice
                    0.00%  1.6830us         3     561ns     259ns  1.0390us  cuDeviceGetCount
                    0.00%  1.0850us         2     542ns     206ns     879ns  cuDeviceGet
                    0.00%     455ns         1     455ns     455ns     455ns  cuDeviceGetUuid

If we increase the number of additions to $1000000$, which the CPU can hardly handle, the GPU could still perform extremely well. The operation has also become math-bound, since the time spent on executing the kernel is now $97.43\%$.

$ nvprof ./add 10000000 1000000 16 1
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 16
Number of Streams Per Thread: 1
==49064== NVPROF is profiling process 49064, command: ./add 10000000 1000000 16 1
Average Latency: 263.62 ms
==49064== Profiling application: ./add 10000000 1000000 16 1
==49064== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   97.43%  25.7634s      1616  15.943ms  14.440ms  26.953ms  add_n_kernel(int*, unsigned int, unsigned long)
                    1.29%  341.34ms      1616  211.23us  197.22us  465.84us  [CUDA memcpy HtoD]
                    1.28%  339.55ms      1616  210.12us  195.46us  467.99us  [CUDA memcpy DtoH]
      API calls:   94.19%  219.036s      3232  67.771ms  204.40us  331.30ms  cudaMemcpyAsync
                    5.75%  13.3678s      1616  8.2721ms  7.3610us  323.28ms  cudaLaunchKernel
                    0.04%  93.263ms         1  93.263ms  93.263ms  93.263ms  cudaMalloc
                    0.02%  35.650ms         1  35.650ms  35.650ms  35.650ms  cudaDeviceReset
                    0.00%  6.2746ms      1616  3.8820us     399ns  191.04us  cudaStreamSynchronize
                    0.00%  205.59us         1  205.59us  205.59us  205.59us  cuDeviceTotalMem
                    0.00%  121.86us       101  1.2060us     118ns  51.089us  cuDeviceGetAttribute
                    0.00%  114.43us        32  3.5750us     857ns  68.004us  cudaStreamCreate
                    0.00%  112.33us         1  112.33us  112.33us  112.33us  cudaFree
                    0.00%  64.705us        32  2.0220us  1.2410us  10.551us  cudaStreamDestroy
                    0.00%  20.017us         1  20.017us  20.017us  20.017us  cuDeviceGetName
                    0.00%  4.6830us         1  4.6830us  4.6830us  4.6830us  cuDeviceGetPCIBusId
                    0.00%  2.9030us         1  2.9030us  2.9030us  2.9030us  cudaSetDevice
                    0.00%     990ns         3     330ns     165ns     638ns  cuDeviceGetCount
                    0.00%     668ns         2     334ns     133ns     535ns  cuDeviceGet
                    0.00%     205ns         1     205ns     205ns     205ns  cuDeviceGetUuid

Multi-Thread Single-Stream VS Single-Thread Multi-Stream

Here we compared the performance between multi-thread single-stream CUDA and single-thread multi-stream CUDA. Concretely, we compared the addition performance for the following two cases:

  • 1 thread and the thread has 16 CUDA streams.
  • 16 threads and each thread has 1 CUDA stream.

From the latencies we measured, we could see that for the math-bound operations there is no significant performance difference between the two cases. For the memory-bound operations the results were mixed: single-thread multi-stream was slightly faster for the smaller array, whereas multi-thread single-stream was slightly faster for the larger one.

$ ./add 100000000 1 16 1
Array Size: 100000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 64.67 ms
$ ./add 100000000 1 1 16
Array Size: 100000000
Number of Additions: 1
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 70.82 ms
$ ./add 10000000 1 16 1
Array Size: 10000000
Number of Additions: 1
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 10.94 ms
$ ./add 10000000 1 1 16
Array Size: 10000000
Number of Additions: 1
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 9.08 ms
$ ./add 10000000 1000000 16 1
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 16
Number of Streams Per Thread: 1
Average Latency: 242.83 ms
$ ./add 10000000 1000000 1 16
Array Size: 10000000
Number of Additions: 1000000
Number of Threads: 1
Number of Streams Per Thread: 16
Average Latency: 250.37 ms

Summary

All the experiment results are summarized below.

Array Size   Number of Additions   Number of Threads   Number of Streams Per Thread   Average Latency (ms)
10000000     1                     16                  0                              2.90
10000000     1                     16                  1                              12.20
10000000     1                     1                   16                             9.08
10000000     100                   16                  0                              176.47
10000000     100                   16                  1                              10.93
10000000     1000000               16                  1                              242.83
10000000     1000000               1                   16                             250.37
100000000    1                     16                  1                              64.67
100000000    1                     1                   16                             70.82

Conclusion

The latency difference between multi-thread single-stream CUDA and single-thread multi-stream CUDA is small, and neither fashion shows a consistent advantage in this benchmark.