1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
| #include <chrono> #include <functional> #include <iomanip> #include <iostream> #include <thread> #include <tuple> #include <utility> #include <vector>
#include <cuda_runtime.h>
#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__) template <typename T> void check(T err, const char* const func, const char* const file, const int line) { if (err != cudaSuccess) { std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << " " << func << std::endl; std::exit(EXIT_FAILURE); } }
#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__) void checkLast(const char* const file, const int line) { cudaError_t err{cudaGetLastError()}; if (err != cudaSuccess) { std::cerr << "CUDA Runtime Error at: " << file << ":" << line << std::endl; std::cerr << cudaGetErrorString(err) << std::endl; std::exit(EXIT_FAILURE); } }
template <class T> float measure_performance(std::function<T(cudaStream_t)> bound_function, cudaStream_t stream, const int num_repeats = 100, const int num_warmups = 100) { cudaEvent_t start, stop; float time;
CHECK_CUDA_ERROR(cudaEventCreate(&start)); CHECK_CUDA_ERROR(cudaEventCreate(&stop));
for (int i{0}; i < num_warmups; ++i) { bound_function(stream); }
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
CHECK_CUDA_ERROR(cudaEventRecord(start, stream)); for (int i{0}; i < num_repeats; ++i) { bound_function(stream); } CHECK_CUDA_ERROR(cudaEventRecord(stop, stream)); CHECK_CUDA_ERROR(cudaEventSynchronize(stop)); CHECK_LAST_CUDA_ERROR(); CHECK_CUDA_ERROR(cudaEventElapsedTime(&time, start, stop)); CHECK_CUDA_ERROR(cudaEventDestroy(start)); CHECK_CUDA_ERROR(cudaEventDestroy(stop));
float latency{time / num_repeats};
return latency; }
__global__ void float_addition(float* output, float const* input_1, float const* input_2, uint32_t const n) { const uint32_t idx{blockDim.x * blockIdx.x + threadIdx.x}; const uint32_t stride{blockDim.x * gridDim.x}; for (uint32_t i{idx}; i < n; i += stride) { output[i] = input_1[i] + input_2[i]; } }
void launch_float_addition(float* output, float const* input_1, float const* input_2, uint32_t const n, cudaStream_t stream) { dim3 threads_per_block{1024}; dim3 blocks_per_grid{32}; float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>( output, input_1, input_2, n); }
int main() { constexpr int num_repeats{10}; constexpr int num_warmups{10};
constexpr int n{1000000}; constexpr int num_test_functions{3}; cudaStream_t stream;
float *d_input_1, *d_input_2, *d_output;
CHECK_CUDA_ERROR(cudaMalloc(&d_input_1, n * sizeof(float))); CHECK_CUDA_ERROR(cudaMalloc(&d_input_2, n * sizeof(float))); CHECK_CUDA_ERROR(cudaMalloc(&d_output, n * sizeof(float)));
CHECK_CUDA_ERROR(cudaStreamCreate(&stream));
std::vector<std::function<void(cudaStream_t)>> test_functions;
for (int i{0}; i < num_test_functions; ++i) { std::function<void(cudaStream_t)> function{ std::bind(launch_float_addition, d_input_1, d_input_2, d_output, n, std::placeholders::_1)}; test_functions.push_back(function); }
for (auto const& function : test_functions) { float latency{ measure_performance(function, stream, num_repeats, num_warmups)}; std::cout << std::fixed << std::setprecision(3) << "Latency: " << latency << " ms" << std::endl; }
CHECK_CUDA_ERROR(cudaStreamDestroy(stream)); }
|