Let's assume that we have a function that multiplies two arrays of 1000000 doubles each. In C/C++ the function looks like this:
/*
 * Element-wise, in-place product of two arrays: a[i] = a[i] * b[i]
 * for the first 1000000 doubles. Both pointers must reference at least
 * 1000000 elements; b is only read.
 */
void mul_c(double* a, double* b)
{
    enum { ELEMENT_COUNT = 1000000 };
    int idx = 0;

    while (idx != ELEMENT_COUNT)
    {
        a[idx] *= b[idx];
        ++idx;
    }
}
The compiler produces the following assembly with -O2:
# Compiler output for mul_c: one scalar double is multiplied per iteration.
# rdi holds the first pointer argument (a) and rsi the second (b); rax is the
# running byte offset into both arrays.
mul_c(double*, double*):
xor eax, eax                        # byte offset = 0
.L2:
movsd xmm0, QWORD PTR [rdi+rax]     # load one double from a
mulsd xmm0, QWORD PTR [rsi+rax]     # multiply it by the matching double from b
movsd QWORD PTR [rdi+rax], xmm0     # store the product back into a
add rax, 8                          # advance by one double (8 bytes)
cmp rax, 8000000                    # 1000000 doubles * 8 bytes each
jne .L2                             # loop until every element is processed
rep ret                             # return to the caller
From the above assembly it seems that the compiler uses SIMD instructions, but it only multiplies one double per iteration. So I decided to write the same function in inline assembly instead, where I make full use of the xmm0 register and multiply two doubles in one go:
/*
 * Element-wise, in-place product of two arrays of 1000000 doubles
 * (a[i] = a[i] * b[i]) using packed SSE2 arithmetic: each iteration loads,
 * multiplies and stores two doubles at once.
 *
 * a - destination/first operand; pinned to rdi by the "D" constraint.
 * b - second operand, only read; pinned to rsi by the "S" constraint.
 *
 * x86-64 only. movupd is the unaligned move, so the arrays need no
 * particular alignment.
 */
void mul_asm(double* a, double* b)
{
    /* __asm__ / __volatile__ are accepted in strict ISO modes as well,
       where the plain asm keyword is not available. */
    __asm__ __volatile__
    (
        ".intel_syntax noprefix                 \n\t"
        "xor rax, rax                           \n\t" /* rax = byte offset */
        "0:                                     \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]     \n\t"
        "mulpd xmm0, xmmword ptr [rsi+rax]      \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0     \n\t"
        "add rax, 16                            \n\t" /* two doubles per step */
        "cmp rax, 8000000                       \n\t" /* 1000000 * 8 bytes */
        "jne 0b                                 \n\t"
        /* Restore the assembler's default mode (AT&T with % prefixes) for
           the compiler-generated code that follows this statement. */
        ".att_syntax prefix                     \n\t"
        :
        : "D" (a), "S" (b)
        /* rax and xmm0 are overwritten by the template, so the compiler
           must be told not to keep live values in them. */
        : "rax", "xmm0", "memory", "cc"
    );
}
After measuring the execution time individually for both of these functions, it seems that both of them take 1 ms to complete:
> gcc -O2 main.cpp
> ./a.out < input
mul_c: 1 ms
mul_asm: 1 ms
[a lot of doubles...]
I expected the SIMD implementation to be at least twice as fast (0 ms), as there are only half as many multiplications/memory instructions.
So my question is: Why isn't the SIMD implementation faster than the ordinary C/C++ implementation when the SIMD implementation only does half the amount of multiplications/memory instructions?
Here's the full program:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/*
 * Element-wise, in-place product of two arrays: a[i] = a[i] * b[i]
 * for the first 1000000 doubles. Both pointers must reference at least
 * 1000000 elements; b is only read.
 */
void mul_c(double* a, double* b)
{
    enum { ELEMENT_COUNT = 1000000 };
    int idx = 0;

    while (idx != ELEMENT_COUNT)
    {
        a[idx] *= b[idx];
        ++idx;
    }
}
/*
 * Element-wise, in-place product of two arrays of 1000000 doubles
 * (a[i] = a[i] * b[i]) using packed SSE2 arithmetic: each iteration loads,
 * multiplies and stores two doubles at once.
 *
 * a - destination/first operand; pinned to rdi by the "D" constraint.
 * b - second operand, only read; pinned to rsi by the "S" constraint.
 *
 * x86-64 only. movupd is the unaligned move, so the arrays need no
 * particular alignment.
 */
void mul_asm(double* a, double* b)
{
    /* __asm__ / __volatile__ are accepted in strict ISO modes as well,
       where the plain asm keyword is not available. */
    __asm__ __volatile__
    (
        ".intel_syntax noprefix                 \n\t"
        "xor rax, rax                           \n\t" /* rax = byte offset */
        "0:                                     \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]     \n\t"
        "mulpd xmm0, xmmword ptr [rsi+rax]      \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0     \n\t"
        "add rax, 16                            \n\t" /* two doubles per step */
        "cmp rax, 8000000                       \n\t" /* 1000000 * 8 bytes */
        "jne 0b                                 \n\t"
        /* Restore the assembler's default mode (AT&T with % prefixes) for
           the compiler-generated code that follows this statement. */
        ".att_syntax prefix                     \n\t"
        :
        : "D" (a), "S" (b)
        /* rax and xmm0 are overwritten by the template, so the compiler
           must be told not to keep live values in them. */
        : "rax", "xmm0", "memory", "cc"
    );
}
/*
 * Benchmark driver: reads 1000000 doubles from standard input, times
 * mul_c and mul_asm with gettimeofday (millisecond resolution) and prints
 * both result arrays side by side so they can be compared.
 *
 * Returns 0 on success, 1 if memory allocation fails or the input does not
 * contain 1000000 parseable doubles.
 */
int main()
{
    enum { ELEMENT_COUNT = 1000000 };
    struct timeval t1;
    struct timeval t2;
    unsigned long long time;
    /* The casts are kept so the file still builds as C++ (main.cpp). */
    double* a = (double*)malloc(sizeof(double) * ELEMENT_COUNT);
    double* b = (double*)malloc(sizeof(double) * ELEMENT_COUNT);
    double* c = (double*)malloc(sizeof(double) * ELEMENT_COUNT);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "error: out of memory\n");
        free(a);
        free(b);
        free(c);
        return 1;
    }
    for (int i = 0; i != ELEMENT_COUNT; ++i)
    {
        double v;
        /* Without this check a short or malformed input would leave v
           uninitialized and the benchmark would run on garbage. */
        if (scanf("%lf", &v) != 1)
        {
            fprintf(stderr, "error: expected %d doubles on stdin, failed at index %d\n", (int)ELEMENT_COUNT, i);
            free(a);
            free(b);
            free(c);
            return 1;
        }
        a[i] = v;
        b[i] = v;
        c[i] = v;
    }
    /* a = a * b */
    gettimeofday(&t1, NULL);
    mul_c(a, b);
    gettimeofday(&t2, NULL);
    time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
    printf("mul_c: %llu ms\n", time);
    /* b = b * c; b and c still hold the input values, so b ends up equal to a. */
    gettimeofday(&t1, NULL);
    mul_asm(b, c);
    gettimeofday(&t2, NULL);
    time = 1000 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000;
    printf("mul_asm: %llu ms\n\n", time);
    for (int i = 0; i != ELEMENT_COUNT; ++i)
    {
        printf("%lf\t\t\t%lf\n", a[i], b[i]);
    }
    free(a);
    free(b);
    free(c);
    return 0;
}
I also tried to make use of all xmm registers (0-7) and remove instruction dependencies to get better parallel computing:
/*
 * Element-wise, in-place product of two arrays of 1000000 doubles
 * (a[i] = a[i] * b[i]), unrolled eight times: every iteration handles
 * 16 doubles (128 bytes) through xmm0-xmm7, with all loads issued first,
 * then all multiplies, then all stores.
 *
 * a - destination/first operand; pinned to rdi by the "D" constraint.
 * b - second operand, only read; pinned to rsi by the "S" constraint.
 *
 * x86-64 only. The total size (8000000 bytes) is an exact multiple of the
 * 128-byte step, so the jne exit test is hit exactly.
 */
void mul_asm(double* a, double* b)
{
    /* __asm__ / __volatile__ are accepted in strict ISO modes as well,
       where the plain asm keyword is not available. */
    __asm__ __volatile__
    (
        ".intel_syntax noprefix                     \n\t"
        "xor rax, rax                               \n\t" /* rax = byte offset */
        "0:                                         \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]         \n\t"
        "movupd xmm1, xmmword ptr [rdi+rax+16]      \n\t"
        "movupd xmm2, xmmword ptr [rdi+rax+32]      \n\t"
        "movupd xmm3, xmmword ptr [rdi+rax+48]      \n\t"
        "movupd xmm4, xmmword ptr [rdi+rax+64]      \n\t"
        "movupd xmm5, xmmword ptr [rdi+rax+80]      \n\t"
        "movupd xmm6, xmmword ptr [rdi+rax+96]      \n\t"
        "movupd xmm7, xmmword ptr [rdi+rax+112]     \n\t"
        "mulpd xmm0, xmmword ptr [rsi+rax]          \n\t"
        "mulpd xmm1, xmmword ptr [rsi+rax+16]       \n\t"
        "mulpd xmm2, xmmword ptr [rsi+rax+32]       \n\t"
        "mulpd xmm3, xmmword ptr [rsi+rax+48]       \n\t"
        "mulpd xmm4, xmmword ptr [rsi+rax+64]       \n\t"
        "mulpd xmm5, xmmword ptr [rsi+rax+80]       \n\t"
        "mulpd xmm6, xmmword ptr [rsi+rax+96]       \n\t"
        "mulpd xmm7, xmmword ptr [rsi+rax+112]      \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0         \n\t"
        "movupd xmmword ptr [rdi+rax+16], xmm1      \n\t"
        "movupd xmmword ptr [rdi+rax+32], xmm2      \n\t"
        "movupd xmmword ptr [rdi+rax+48], xmm3      \n\t"
        "movupd xmmword ptr [rdi+rax+64], xmm4      \n\t"
        "movupd xmmword ptr [rdi+rax+80], xmm5      \n\t"
        "movupd xmmword ptr [rdi+rax+96], xmm6      \n\t"
        "movupd xmmword ptr [rdi+rax+112], xmm7     \n\t"
        "add rax, 128                               \n\t" /* 16 doubles per step */
        "cmp rax, 8000000                           \n\t" /* 1000000 * 8 bytes */
        "jne 0b                                     \n\t"
        /* Restore the assembler's default mode (AT&T with % prefixes) for
           the compiler-generated code that follows this statement. */
        ".att_syntax prefix                         \n\t"
        :
        : "D" (a), "S" (b)
        /* Every register the template overwrites must be listed so the
           compiler does not keep live values in them. */
        : "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
          "memory", "cc"
    );
}
But it still runs at 1 ms, the same speed as the ordinary C/C++ implementation.
UPDATES
As suggested by answers/comments, I've implemented another way of measuring the execution time:
#include <stdio.h>
#include <stdlib.h>
/*
 * Element-wise, in-place product of two arrays: a[i] = a[i] * b[i]
 * for the first 1000000 doubles. Both pointers must reference at least
 * 1000000 elements; b is only read.
 */
void mul_c(double* a, double* b)
{
    enum { ELEMENT_COUNT = 1000000 };
    int idx = 0;

    while (idx != ELEMENT_COUNT)
    {
        a[idx] *= b[idx];
        ++idx;
    }
}
/*
 * Element-wise, in-place product of two arrays of 1000000 doubles
 * (a[i] = a[i] * b[i]) using packed SSE2 arithmetic: each iteration loads,
 * multiplies and stores two doubles at once.
 *
 * a - destination/first operand; pinned to rdi by the "D" constraint.
 * b - second operand, only read; pinned to rsi by the "S" constraint.
 *
 * x86-64 only. movupd is the unaligned move, so the arrays need no
 * particular alignment.
 */
void mul_asm(double* a, double* b)
{
    /* __asm__ / __volatile__ are accepted in strict ISO modes as well,
       where the plain asm keyword is not available. */
    __asm__ __volatile__
    (
        ".intel_syntax noprefix                 \n\t"
        "xor rax, rax                           \n\t" /* rax = byte offset */
        "0:                                     \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]     \n\t"
        "mulpd xmm0, xmmword ptr [rsi+rax]      \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0     \n\t"
        "add rax, 16                            \n\t" /* two doubles per step */
        "cmp rax, 8000000                       \n\t" /* 1000000 * 8 bytes */
        "jne 0b                                 \n\t"
        /* Restore the assembler's default mode (AT&T with % prefixes) for
           the compiler-generated code that follows this statement. */
        ".att_syntax prefix                     \n\t"
        :
        : "D" (a), "S" (b)
        /* rax and xmm0 are overwritten by the template, so the compiler
           must be told not to keep live values in them. */
        : "rax", "xmm0", "memory", "cc"
    );
}
/*
 * Element-wise, in-place product of two arrays of 1000000 doubles
 * (a[i] = a[i] * b[i]), unrolled eight times: every iteration handles
 * 16 doubles (128 bytes) through xmm0-xmm7, with all loads issued first,
 * then all multiplies, then all stores.
 *
 * a - destination/first operand; pinned to rdi by the "D" constraint.
 * b - second operand, only read; pinned to rsi by the "S" constraint.
 *
 * x86-64 only. The total size (8000000 bytes) is an exact multiple of the
 * 128-byte step, so the jne exit test is hit exactly.
 */
void mul_asm2(double* a, double* b)
{
    /* __asm__ / __volatile__ are accepted in strict ISO modes as well,
       where the plain asm keyword is not available. */
    __asm__ __volatile__
    (
        ".intel_syntax noprefix                     \n\t"
        "xor rax, rax                               \n\t" /* rax = byte offset */
        "0:                                         \n\t"
        "movupd xmm0, xmmword ptr [rdi+rax]         \n\t"
        "movupd xmm1, xmmword ptr [rdi+rax+16]      \n\t"
        "movupd xmm2, xmmword ptr [rdi+rax+32]      \n\t"
        "movupd xmm3, xmmword ptr [rdi+rax+48]      \n\t"
        "movupd xmm4, xmmword ptr [rdi+rax+64]      \n\t"
        "movupd xmm5, xmmword ptr [rdi+rax+80]      \n\t"
        "movupd xmm6, xmmword ptr [rdi+rax+96]      \n\t"
        "movupd xmm7, xmmword ptr [rdi+rax+112]     \n\t"
        "mulpd xmm0, xmmword ptr [rsi+rax]          \n\t"
        "mulpd xmm1, xmmword ptr [rsi+rax+16]       \n\t"
        "mulpd xmm2, xmmword ptr [rsi+rax+32]       \n\t"
        "mulpd xmm3, xmmword ptr [rsi+rax+48]       \n\t"
        "mulpd xmm4, xmmword ptr [rsi+rax+64]       \n\t"
        "mulpd xmm5, xmmword ptr [rsi+rax+80]       \n\t"
        "mulpd xmm6, xmmword ptr [rsi+rax+96]       \n\t"
        "mulpd xmm7, xmmword ptr [rsi+rax+112]      \n\t"
        "movupd xmmword ptr [rdi+rax], xmm0         \n\t"
        "movupd xmmword ptr [rdi+rax+16], xmm1      \n\t"
        "movupd xmmword ptr [rdi+rax+32], xmm2      \n\t"
        "movupd xmmword ptr [rdi+rax+48], xmm3      \n\t"
        "movupd xmmword ptr [rdi+rax+64], xmm4      \n\t"
        "movupd xmmword ptr [rdi+rax+80], xmm5      \n\t"
        "movupd xmmword ptr [rdi+rax+96], xmm6      \n\t"
        "movupd xmmword ptr [rdi+rax+112], xmm7     \n\t"
        "add rax, 128                               \n\t" /* 16 doubles per step */
        "cmp rax, 8000000                           \n\t" /* 1000000 * 8 bytes */
        "jne 0b                                     \n\t"
        /* Restore the assembler's default mode (AT&T with % prefixes) for
           the compiler-generated code that follows this statement. */
        ".att_syntax prefix                         \n\t"
        :
        : "D" (a), "S" (b)
        /* Every register the template overwrites must be listed so the
           compiler does not keep live values in them. */
        : "rax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
          "memory", "cc"
    );
}
/*
 * Reads the processor's time-stamp counter with RDTSCP and returns it as
 * one 64-bit value (x86-64 only).
 *
 * RDTSCP delivers the counter split across EDX:EAX and additionally
 * overwrites ECX, so all three registers are declared to the compiler:
 * the two halves as outputs, rcx as a clobber. The halves are combined in
 * C, which makes the hand-written xor/shl/or sequence unnecessary.
 */
unsigned long timestamp()
{
    unsigned int lo;
    unsigned int hi;

    /* The mnemonic has no operands, so it assembles identically in AT&T
       and Intel syntax and no syntax-switching directives are needed. */
    __asm__ __volatile__
    (
        "rdtscp"
        : "=a" (lo), "=d" (hi)
        :
        : "rcx", "memory"
    );

    /* Widen before shifting so the shift is well defined regardless of
       the width of unsigned long. */
    return (unsigned long)(((unsigned long long)hi << 32) | lo);
}
/*
 * Benchmark driver: reads 1000000 doubles from standard input, measures
 * one multiplication routine in time-stamp-counter cycles and prints the
 * resulting arrays.
 *
 * Returns 0 on success, 1 if memory allocation fails or the input does not
 * contain 1000000 parseable doubles.
 */
int main()
{
    enum { ELEMENT_COUNT = 1000000 };
    unsigned long t1;
    unsigned long t2;
    double* a;
    double* b;
    /* The casts are kept so the file still builds as C++ (main.cpp). */
    a = (double*)malloc(sizeof(double) * ELEMENT_COUNT);
    b = (double*)malloc(sizeof(double) * ELEMENT_COUNT);
    if (a == NULL || b == NULL)
    {
        fprintf(stderr, "error: out of memory\n");
        free(a);
        free(b);
        return 1;
    }
    for (int i = 0; i != ELEMENT_COUNT; ++i)
    {
        double v;
        /* Without this check a short or malformed input would leave v
           uninitialized and the benchmark would run on garbage. */
        if (scanf("%lf", &v) != 1)
        {
            fprintf(stderr, "error: expected %d doubles on stdin, failed at index %d\n", (int)ELEMENT_COUNT, i);
            free(a);
            free(b);
            return 1;
        }
        a[i] = v;
        b[i] = v;
    }
    /* Exactly one of the three implementations is enabled per build; the
       other two stay commented out as switchable alternatives. */
    t1 = timestamp();
    mul_c(a, b);
    //mul_asm(a, b);
    //mul_asm2(a, b);
    t2 = timestamp();
    printf("mul_c: %lu cycles\n\n", t2 - t1);
    for (int i = 0; i != ELEMENT_COUNT; ++i)
    {
        printf("%lf\t\t\t%lf\n", a[i], b[i]);
    }
    free(a);
    free(b);
    return 0;
}
When I run the program with this measurement, I get this result:
mul_c: ~2163971628 cycles
mul_asm: ~2532045184 cycles
mul_asm2: ~5230488 cycles <-- what???
Two things are worth noticing here. First of all, the cycle counts vary a lot, and I assume that's because the operating system allows other processes to run in between. Is there any way to prevent that, or to count only the cycles while my program is executing? Also, mul_asm2 produces identical output compared to the other two, but it is so much faster. How?
I tried Z boson's program on my system together with my 2 implementations and got the following result:
> g++ -O2 -fopenmp main.cpp
> ./a.out
mul time 1.33, 18.08 GB/s
mul_SSE time 1.13, 21.24 GB/s
mul_SSE_NT time 1.51, 15.88 GB/s
mul_SSE_OMP time 0.79, 30.28 GB/s
mul_SSE_v2 time 1.12, 21.49 GB/s
mul_v2 time 1.26, 18.99 GB/s
mul_asm time 1.12, 21.50 GB/s
mul_asm2 time 1.09, 22.08 GB/s
Best Answer
There was a major bug in the timing function I used for previous benchmarks. This grossly underestimated the bandwidth without vectorization as well as other measurements. Additionally, there was another problem that was overestimating the bandwidth due to COW on the array that was read but not written to. Finally, the maximum bandwidth I used was incorrect. I have updated my answer with the corrections and I have left the old answer at the end of this answer.
Your operation is memory bandwidth bound. This means the CPU is spending most of its time waiting on slow memory reads and writes. An excellent explanation for this can be found here: Why vectorizing the loop does not have performance improvement.
However, I have to disagree slightly with one statement in that answer.
In fact, vectorization, unrolling, and multiple threads can significantly increase the bandwidth even in memory bandwidth bound operations. The reason is that it is difficult to obtain the maximum memory bandwidth. A good explanation for this can be found here: https://stackoverflow.com/a/25187492/2542702. The rest of my answer will show how vectorization and multiple threads can get closer to the maximum memory bandwidth.
My test system: Ubuntu 16.10, Skylake (i7-6700HQ@2.60GHz), 32GB RAM, dual channel DDR4@2400 MHz. The maximum bandwidth from my system is 38.4 GB/s.
From the code below I produce the following tables. I set the number of threads using OMP_NUM_THREADS, e.g. `export OMP_NUM_THREADS=4`. The efficiency is bandwidth/max_bandwidth.
After several iterations of running, due to uncertainties in the measurements, I have formed the following conclusions:
The solution that gives the best bandwidth is scalar operations with two threads.
The code I used to benchmark:
The old solution with the timing bug
The modern solution for inline assembly is to use intrinsics. There are still cases where one needs inline assembly but this is not one of them.
One intrinsics solution for your inline assembly approach is simply:
Let me define some test code
Now the first test
So with `-O2`, which does not vectorize loops, we see that the intrinsic SSE version is much faster than the plain C solution `mul`. Here efficiency = bandwidth_measured/max_bandwidth, where the max is 34.1 GB/s for my system.
Second test
With `-O3`, GCC vectorizes the loop and the intrinsic function offers essentially no advantage.
Third test
With `-funroll-loops`, GCC unrolls the loops eight times and we see a significant improvement, except for the non-temporal store solution, and no real advantage for the OpenMP solution.
Before unrolling the loop, the assembly for `mul` with `-O3` is:
With `-O3 -funroll-loops` the assembly for `mul` is:
Fourth test
Now the non-intrinsic function is the fastest (excluding the OpenMP version).
So there is no reason to use intrinsics or inline assembly in this case because we can get the best performance with appropriate compiler options (e.g. `-O3`, `-funroll-loops`, `-mavx`).
Test system: Ubuntu 16.10, Skylake (i7-6700HQ@2.60GHz), 32GB RAM. Maximum memory bandwidth (34.1 GB/s) https://ark.intel.com/products/88967/Intel-Core-i7-6700HQ-Processor-6M-Cache-up-to-3_50-GHz
Here is another solution worth considering. The `cmp` instruction is not necessary if we count from -N up to zero and access the arrays as `N+i`. GCC should have fixed this a long time ago. It eliminates one instruction (though due to macro-op fusion the cmp and jmp often count as one micro-op).
Assembly with `-O3`:
This optimization will only possibly be helpful when the arrays fit in, e.g., the L1 cache, i.e. when not reading from main memory.
I finally found a way to get the plain C solution to not generate the `cmp` instruction.
And then call the function from a separate object file like this: `mul_v2(&a[N],&b[N])`, so this is perhaps the best solution. However, if you call the function from the same object file (translation unit) as the one it's defined in, then GCC generates the `cmp` instruction again.
Also,
still generates the `cmp` instruction and generates the same assembly as the `mul` function.
The function `mul_SSE_NT` is silly. It uses non-temporal stores, which are only useful when only writing to memory, but since the function reads and writes to the same address, non-temporal stores are not only useless, they give inferior results.
Previous versions of this answer were getting the wrong bandwidth. The reason was that the arrays were not initialized.