1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
| #include <chrono> #include <functional> #include <iomanip> #include <iostream> #include <thread> #include <tuple> #include <utility> #include <vector>
#include <cuda_runtime.h>
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) void check(cudaError_t err, const char* const func, const char* const file, const int line) { if (err != cudaSuccess) { std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << " " << func << std::endl; std::exit(EXIT_FAILURE); } }
#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) void checkLast(const char* const file, const int line) { cudaError_t const err{cudaGetLastError()}; if (err != cudaSuccess) { std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << std::endl; std::exit(EXIT_FAILURE); } }
template <class T> float measure_performance(std::function<T(cudaStream_t)> bound_function, cudaStream_t stream, size_t num_repeats = 100, size_t num_warmups = 100) { cudaEvent_t start, stop; float time;
CHECK_CUDA_ERROR(cudaEventCreate(&start)); CHECK_CUDA_ERROR(cudaEventCreate(&stop));
for (size_t i{0}; i < num_warmups; ++i) { bound_function(stream); }
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
CHECK_CUDA_ERROR(cudaEventRecord(start, stream)); for (size_t i{0}; i < num_repeats; ++i) { bound_function(stream); } CHECK_CUDA_ERROR(cudaEventRecord(stop, stream)); CHECK_CUDA_ERROR(cudaEventSynchronize(stop)); CHECK_LAST_CUDA_ERROR(); CHECK_CUDA_ERROR(cudaEventElapsedTime(&time, start, stop)); CHECK_CUDA_ERROR(cudaEventDestroy(start)); CHECK_CUDA_ERROR(cudaEventDestroy(stop));
float const latency{time / num_repeats};
return latency; }
__global__ void float_addition(float* output, float const* input_1, float const* input_2, size_t n) { size_t const idx{blockDim.x * blockIdx.x + threadIdx.x}; size_t const stride{blockDim.x * gridDim.x}; for (size_t i{idx}; i < n; i += stride) { output[i] = input_1[i] + input_2[i]; } }
void launch_float_addition(float* output, float const* input_1, float const* input_2, size_t n, cudaStream_t stream) { dim3 const threads_per_block{1024}; dim3 const blocks_per_grid{32}; float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>( output, input_1, input_2, n); CHECK_LAST_CUDA_ERROR(); }
int main() { constexpr size_t num_repeats{10}; constexpr size_t num_warmups{10};
constexpr size_t n{1000000}; constexpr size_t num_test_functions{3}; cudaStream_t stream;
float *d_input_1, *d_input_2, *d_output;
CHECK_CUDA_ERROR(cudaMalloc(&d_input_1, n * sizeof(float))); CHECK_CUDA_ERROR(cudaMalloc(&d_input_2, n * sizeof(float))); CHECK_CUDA_ERROR(cudaMalloc(&d_output, n * sizeof(float)));
CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
std::vector<std::function<void(cudaStream_t)>> test_functions;
for (int i{0}; i < num_test_functions; ++i) { std::function<void(cudaStream_t)> function{ std::bind(launch_float_addition, d_output, d_input_1, d_input_2, n, std::placeholders::_1)}; test_functions.push_back(function); }
for (auto const& function : test_functions) { float const latency{ measure_performance(function, stream, num_repeats, num_warmups)}; std::cout << std::fixed << std::setprecision(3) << "Latency: " << latency << " ms" << std::endl; }
CHECK_CUDA_ERROR(cudaFree(d_input_1)); CHECK_CUDA_ERROR(cudaFree(d_input_2)); CHECK_CUDA_ERROR(cudaFree(d_output)); CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); }
|