C++ SIMD CPU Vectorization

Introduction

SIMD stands for “single instruction, multiple data”. With CPU SIMD intrinsics, we could process data in parallel to some limited extent.

In this blog post, I would like to discuss the C++ SIMD CPU vectorization and some of its caveats.

C++ SIMD CPU Vectorization

Vector Normalization

We implemented the vector normalization methods using scalar method, std::valarray method, SSE __m128 data structure and methods, and AVX __m256 data structure and methods, and compared their performances.

vector_normalization.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
// g++ vector_normalization.cpp -o vector_normalization -msse2 -mavx -std=c++17

#include <cassert>
#include <chrono>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <random>
#include <stdexcept>
#include <valarray>
#include <vector>
// https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/immintrin.h
#include <immintrin.h>
#include <type_traits>

template <typename T>
std::vector<std::vector<T>> generate_vectors(const std::size_t num_vectors,
const std::size_t vector_size,
const unsigned long random_seed)
{
std::mt19937 gen{random_seed};
std::uniform_real_distribution<> dist{-1.0, 1.0};

std::vector<std::vector<T>> data(num_vectors);
for (std::size_t i = 0; i < num_vectors; i++)
{
std::vector<T> vec(vector_size);
for (std::size_t j = 0; j < vector_size; j++)
{
vec.at(j) = dist(gen);
}
data.at(i) = vec;
}
return data;
}

/*
template<typename T, size_t N>
union f32xN
{
T m;
float f[N];
};

// warning: ignoring attributes on template argument
typedef f32xN<__m256, 8> f32x8;
typedef f32xN<__m128, 4> f32x4;
*/

union f32x4
{
__m128 m;
float f[4];
};

union f32x8
{
__m256 m;
float f[8];
};

template <typename T, size_t N>
void set_f32xN(T& vec, const float* v)
{
for (std::size_t i = 0; i < N; i++)
{
vec.f[i] = v[i];
}
}

template <typename T, size_t N>
class FloatMatrixN
{
public:
FloatMatrixN(const std::size_t h, const std::size_t w) : m_h(h), m_w(w)
{
m_num_arrays = static_cast<std::size_t>(
std::ceil(static_cast<float>(m_w) / static_cast<float>(N)));
m_w_actual = m_num_arrays * N;
T vec;
const std::vector<float> zeros(N, 0);
set_f32xN<T, N>(vec, zeros.data());
std::vector<T> array(m_h, vec);
m_data = std::vector<std::vector<T>>(m_num_arrays, array);
}

void set_value(const std::size_t i, const std::size_t j, const float val)
{
if (i < 0 || i >= m_h)
{
throw std::runtime_error("Index i is out of bound.");
}
if (j < 0 || j >= m_w)
{
throw std::runtime_error("Index j is out of bound.");
}
m_data[j / N][i].f[j % N] = val;
}

float get_value(const std::size_t i, const std::size_t j) const
{
return m_data[j / N][i].f[j % N];
}

void normalize()
{
const std::vector<float> zeros(N, 0);
T zero_vec;
set_f32xN<T, N>(zero_vec, zeros.data());
for (std::size_t i = 0; i < m_num_arrays; i++)
{
std::vector<T>& f32_vec = m_data[i];
T vec = zero_vec;
for (std::size_t j = 0; j < m_h; j++)
{
if constexpr (std::is_same_v<T, f32x4>)
{
vec.m += _mm_mul_ps(f32_vec[j].m, f32_vec[j].m);
}
else if constexpr (std::is_same_v<T, f32x8>)
{
vec.m += _mm256_mul_ps(f32_vec[j].m, f32_vec[j].m);
}
else
{
throw std::runtime_error{"Unsupported"};
}
}
if constexpr (std::is_same_v<T, f32x4>)
{
vec.m = 1.0f / _mm_sqrt_ps(vec.m);
}
else if constexpr (std::is_same_v<T, f32x8>)
{
vec.m = 1.0f / _mm256_sqrt_ps(vec.m);
}
for (std::size_t j = 0; j < m_h; j++)
{
f32_vec[j].m *= vec.m;
}
}
}

private:
std::vector<std::vector<T>> m_data;
std::size_t m_h;
std::size_t m_w;
std::size_t m_num_arrays;
std::size_t m_w_actual; // Multiples of N
};

typedef FloatMatrixN<f32x8, 8> FloatMatrix8;
typedef FloatMatrixN<f32x4, 4> FloatMatrix4;

template <typename T>
T convert_vectors_to_matrix(const std::vector<std::vector<float>>& data,
const std::size_t num_vectors,
const std::size_t vector_size)
{
T matrix{vector_size, num_vectors};
for (std::size_t i = 0; i < num_vectors; i++)
{
for (std::size_t j = 0; j < vector_size; j++)
{
matrix.set_value(j, i, data[i][j]);
}
}
return matrix;
}

template <typename T>
std::vector<std::vector<float>>
convert_matrix_to_vectors(const T& matrix, const std::size_t num_vectors,
const std::size_t vector_size)
{
std::vector<std::vector<float>> converted_data(num_vectors);
for (std::size_t i = 0; i < num_vectors; i++)
{
std::vector<float> row(vector_size);
for (std::size_t j = 0; j < vector_size; j++)
{
row[j] = matrix.get_value(j, i);
}
converted_data.at(i) = row;
}
return converted_data;
}

template <typename T>
inline void normalize(std::vector<std::vector<T>>& data)
{
for (std::size_t i = 0; i < data.size(); i++)
{
std::vector<T>& row = data[i];
float square_sum = 0;
for (std::size_t j = 0; j < row.size(); j++)
{
square_sum += row[j] * row[j];
}
for (std::size_t j = 0; j < row.size(); j++)
{
row[j] /= std::sqrt(square_sum);
}
}
}

template <typename T>
std::vector<std::valarray<T>>
convert_vectors_to_valarray(const std::vector<std::vector<T>>& data,
const std::size_t num_vectors,
const std::size_t vector_size)
{
std::vector<std::valarray<T>> converted_data(vector_size);
for (std::size_t i = 0; i < vector_size; i++)
{
std::valarray<T> row(num_vectors);
for (std::size_t j = 0; j < num_vectors; j++)
{
row[j] = data[j][i];
}
converted_data.at(i) = row;
}
return converted_data;
}

template <typename T>
std::vector<std::vector<T>>
convert_valarray_to_vectors(const std::vector<std::valarray<T>>& data,
const std::size_t num_vectors,
const std::size_t vector_size)
{
std::vector<std::vector<T>> converted_data(num_vectors);
for (std::size_t i = 0; i < num_vectors; i++)
{
std::vector<T> row(vector_size);
for (std::size_t j = 0; j < vector_size; j++)
{
row[j] = data[j][i];
}
converted_data.at(i) = row;
}
return converted_data;
}

template <typename T>
inline void normalize(std::vector<std::valarray<T>>& data)
{
std::valarray<T> square_sum(0.0f, data[0].size());
for (std::size_t i = 0; i < data.size(); i++)
{
square_sum += data[i] * data[i];
}
std::valarray<T> multiplier = 1.0f / std::sqrt(square_sum);
for (std::size_t i = 0; i < data.size(); i++)
{
data[i] *= multiplier;
}
}

template <typename T>
bool is_equivalent(const std::vector<std::vector<T>>& data_1,
const std::vector<std::vector<T>>& data_2, const T& atol)
{
if (data_1.size() != data_2.size())
{
return false;
}
for (std::size_t i = 0; i < data_1.size(); i++)
{
const std::vector<T>& row_1 = data_1.at(i);
const std::vector<T>& row_2 = data_2.at(i);
if (row_1.size() != row_2.size())
{
return false;
}
for (std::size_t j = 0; j < row_1.size(); j++)
{
if (std::abs(row_1.at(j) - row_2.at(j)) > atol)
{
std::cout << row_1.at(j) << " " << row_2.at(j) << std::endl;
return false;
}
}
}
return true;
}

int main()
{
const unsigned long random_seed{0};

const std::size_t num_vectors{25601};
const std::size_t vector_size{33};
const float atol{1e-7};

std::chrono::time_point<std::chrono::high_resolution_clock> start;
std::chrono::time_point<std::chrono::high_resolution_clock> end;

// Data for experiments.
const std::vector<std::vector<float>> vectors =
generate_vectors<float>(num_vectors, vector_size, random_seed);

// Make a copy of the data.
// num_vectors x vector_size
std::vector<std::vector<float>> normalized_vectors = vectors;
// Normalize.
start = std::chrono::high_resolution_clock::now();
normalize(normalized_vectors);
end = std::chrono::high_resolution_clock::now();
std::cout << std::setw(25) << "Baseline Elapsed Time: " << std::setw(8)
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end -
start)
.count()
<< " ns" << std::endl;

// vector_size x num_vectors
std::vector<std::valarray<float>> normalized_valarray =
convert_vectors_to_valarray(vectors, num_vectors, vector_size);
start = std::chrono::high_resolution_clock::now();
normalize(normalized_valarray);
end = std::chrono::high_resolution_clock::now();
std::cout << std::setw(25) << "Valarray Elapsed Time: " << std::setw(8)
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end -
start)
.count()
<< " ns" << std::endl;
assert(is_equivalent(normalized_vectors,
convert_valarray_to_vectors(normalized_valarray,
num_vectors, vector_size),
atol));

// vector_size x num_vectors
FloatMatrix4 normalized_matrix4 = convert_vectors_to_matrix<FloatMatrix4>(
vectors, num_vectors, vector_size);
start = std::chrono::high_resolution_clock::now();
normalized_matrix4.normalize();
end = std::chrono::high_resolution_clock::now();
std::cout << std::setw(25) << "SSE Elapsed Time: " << std::setw(8)
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end -
start)
.count()
<< " ns" << std::endl;
assert(is_equivalent(normalized_vectors,
convert_matrix_to_vectors<FloatMatrix4>(
normalized_matrix4, num_vectors, vector_size),
atol));

// vector_size x num_vectors
FloatMatrix8 normalized_matrix8 = convert_vectors_to_matrix<FloatMatrix8>(
vectors, num_vectors, vector_size);
start = std::chrono::high_resolution_clock::now();
normalized_matrix8.normalize();
end = std::chrono::high_resolution_clock::now();
std::cout << std::setw(25) << "AVX Elapsed Time: " << std::setw(8)
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end -
start)
.count()
<< " ns" << std::endl;
assert(is_equivalent(normalized_vectors,
convert_matrix_to_vectors<FloatMatrix8>(
normalized_matrix8, num_vectors, vector_size),
atol));
}

Build and run the application using Intel Core i9-9900K.

1
2
3
4
5
6
$ g++ vector_normalization.cpp -o vector_normalization -msse2 -mavx -std=c++17
$ ./vector_normalization
Baseline Elapsed Time: 9449853 ns
Valarray Elapsed Time: 8705188 ns
SSE Elapsed Time: 1787348 ns
AVX Elapsed Time: 848259 ns

We found that comparing to the baseline and std::valarray methods, vectorization using SSE and AVX achieves ~5x and ~10x speed up, respectively.

AVX C++ Standards Compliance

It seems that AVX has some compliance issues with C++11 and C++14. The following minimum AVX application encountered segmentation fault if the application was built with C++11 or C++14, but not with C++17 or C++20, on Intel Core i9-9900K.

avx_minimum.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// OK: g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++2a
// OK: g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++17
// Segmentation Fault: g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++14
// Segmentation Fault: g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++11

#include <immintrin.h>
#include <iostream>
#include <valarray>
#include <vector>

union f32x8
{
__m256 m;
float f[8];
};

int main()
{
f32x8 a;
std::size_t num_arrays = 2;
std::size_t h = 3;
std::vector<f32x8> array(h, a);
std::vector<std::vector<f32x8>> data(num_arrays, array);
for (std::size_t i = 0; i < num_arrays; i++)
{
for (std::size_t j = 0; j < h; j++)
{
std::cout << "i=" << i << ", "
<< "j=" << j << std::endl;
data[i][j].m += _mm256_mul_ps(data[i][j].m, data[i][j].m);
}
}
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
$ g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++17
$ ./avx_minimum
i=0, j=0
i=0, j=1
i=0, j=2
i=1, j=0
i=1, j=1
i=1, j=2
$ g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++14
$ ./avx_minimum
i=0, j=0
i=0, j=1
i=0, j=2
i=1, j=0
Segmentation fault (core dumped)

Switching the compiler from GNU GCC to LLVM Clang resulted in the same phenomenon.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
$ clang++ avx_minimum.cpp -o avx_minimum -mavx -std=c++17
$ ./avx_minimum
i=0, j=0
i=0, j=1
i=0, j=2
i=1, j=0
i=1, j=1
i=1, j=2
$ clang++ avx_minimum.cpp -o avx_minimum -mavx -std=c++14
$ ./avx_minimum
i=0, j=0
i=0, j=1
i=0, j=2
i=1, j=0
Segmentation fault (core dumped)

Using std::valarray instead of std::vector as the container for f32x8 also resulted in the weird segmentation fault problem. But this time none of C++11 or C++14, C++17 and C++20 worked.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#include <immintrin.h>
#include <iostream>
#include <valarray>
#include <vector>

union f32x8
{
__m256 m;
float f[8];
};

int main()
{
f32x8 a;
std::size_t num_arrays = 2;
std::size_t h = 3;
std::valarray<f32x8> array(a, h);
std::vector<std::valarray<f32x8>> data(num_arrays, array);
for (std::size_t i = 0; i < num_arrays; i++)
{
for (std::size_t j = 0; j < h; j++)
{
std::cout << "i=" << i << ", "
<< "j=" << j << std::endl;
data[i][j].m += _mm256_mul_ps(data[i][j].m, data[i][j].m);
}
}
}
1
2
3
4
$ g++ avx_minimum.cpp -o avx_minimum -mavx -std=c++17
$ ./avx_minimum
i=0, j=0
Segmentation fault (core dumped)

However, this problem does not seem to be reproducible with MSVC compiler on Windows.

C++ union might have undefined behavior if the member if not active by assigning values before access. This is exactly what’s happening in our applications. For example, the __m256 typed member of f32x8 objects were never assigned. To access the __m256 typed member of a f32x8 object when the float [] typed member is active, we could also try:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#include <immintrin.h>
#include <iostream>
#include <valarray>
#include <vector>

union f32x8
{
__m256 m;
float f[8];
};

int main()
{
f32x8 a;
for (int i{0}; i < 8; ++i)
{
a.f[i] = i;
}
std::size_t num_arrays = 2;
std::size_t h = 3;
std::vector<f32x8> array(h, a);
std::vector<std::vector<f32x8>> data(num_arrays, array);
for (std::size_t i = 0; i < num_arrays; i++)
{
for (std::size_t j = 0; j < h; j++)
{
std::cout << "i=" << i << ", "
<< "j=" << j << std::endl;
*reinterpret_cast<__m256*>(data[i][j].f) +=
_mm256_mul_ps(*reinterpret_cast<__m256*>(data[i][j].f),
*reinterpret_cast<__m256*>(data[i][j].f));
}
}
}

However, because reinterpret_cast is also not type-safe, this trick does not resolve the segmentation fault problem we encountered previously.

In fact, it is not necessary to use union. We could set the __m256 values using intrinsic instructions such as _mm256_set_ps. However, we still encountered the same segmentation fault problem.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#include <immintrin.h>
#include <iostream>
#include <valarray>
#include <vector>

int main()
{
__m256 m;
m = _mm256_set_ps(0, 1, 2, 3, 4, 5, 6, 7);
std::size_t num_arrays = 2;
std::size_t h = 3;
std::vector<__m256> array(h, m);
std::vector<std::vector<__m256>> data(num_arrays, array);
for (std::size_t i = 0; i < num_arrays; i++)
{
for (std::size_t j = 0; j < h; j++)
{
std::cout << "i=" << i << ", "
<< "j=" << j << std::endl;
data[i][j] += _mm256_mul_ps(data[i][j], data[i][j]);
}
}
}

References

Author

Lei Mao

Posted on

01-24-2022

Updated on

07-02-2022

Licensed under


Comments