C++ Performance – Why Plain C++ Code is 10 Times Faster than Inline Assembler

Tags: assembly, c++, performance

These two code snippets do the same thing: Adding two float arrays together and storing the result back into them.

Inline Assembler:

// Adds the four packed floats at v2 into v1 using SSE, then stores the sum
// into BOTH arrays (mirroring the copy-back in vecAdd_Std).
// MSVC-only 32-bit inline assembly (_asm); not available in x64 MSVC builds.
// NOTE(review): the _asm block is opaque to the optimizer, so the compiler
// cannot inline this function or optimize around it — presumably the cause
// of the slowdown measured below.
void vecAdd_SSE(float* v1, float* v2) { 
    _asm {
        mov esi, v1            // esi = address of first array
        mov edi, v2            // edi = address of second array
        movups xmm0, [esi]     // unaligned load of v1[0..3]
        movups xmm1, [edi]     // unaligned load of v2[0..3]
        addps xmm0, xmm1       // packed single-precision add: xmm0 = v1 + v2
        movups [esi], xmm0     // store sum back into v1
        movups [edi], xmm0     // store the SAME sum into v2 as well
    }
}

Plain C++ Code:

// Element-wise adds v2 into v1 (four floats), then mirrors the sums back
// into v2, so both arrays end up holding v1 + v2 — the same net effect as
// the vecAdd_SSE variant above.
void vecAdd_Std(float* v1, float* v2) {
    // Accumulate the sums into the first array.
    for (int i = 0; i < 4; ++i) {
        v1[i] = v1[i] + v2[i];
    }

    // Copy the results into the second array as well.
    for (int i = 0; i < 4; ++i) {
        v2[i] = v1[i];
    }
}

Disassembly for the C++ code (taken in Debug mode, because for some reason I cannot view the disassembly in Release mode):

 void vecAdd_Std(float* v1, float* v2) {
 push        ebp  
 mov         ebp,esp  
 sub         esp,0C0h  
 push        ebx  
 push        esi  
 push        edi  
 lea         edi,[ebp-0C0h]  
 mov         ecx,30h  
 mov         eax,0CCCCCCCCh  
 rep stos    dword ptr es:[edi]  

    v1[0] = v1[0]+ v2[0];
 mov         eax,4  
 imul        ecx,eax,0  
 mov         edx,4  
 imul        eax,edx,0  
 mov         edx,dword ptr [v1]  
 mov         esi,dword ptr [v2]  
 movss       xmm0,dword ptr [edx+ecx]  
 addss       xmm0,dword ptr [esi+eax]  
 mov         eax,4  
 imul        ecx,eax,0  
 mov         edx,dword ptr [v1]  
 movss       dword ptr [edx+ecx],xmm0  
    v1[1] = v1[1]+ v2[1];
 mov         eax,4  
 shl         eax,0  
    v1[1] = v1[1]+ v2[1];
 mov         ecx,4  
 shl         ecx,0  
 mov         edx,dword ptr [v1]  
 mov         esi,dword ptr [v2]  
 movss       xmm0,dword ptr [edx+eax]  
 addss       xmm0,dword ptr [esi+ecx]  
 mov         eax,4  
 shl         eax,0  
 mov         ecx,dword ptr [v1]  
 movss       dword ptr [ecx+eax],xmm0  
    v1[2] = v1[2]+ v2[2];
 mov         eax,4  
 shl         eax,1  
 mov         ecx,4  
 shl         ecx,1  
 mov         edx,dword ptr [v1]  
 mov         esi,dword ptr [v2]  
 movss       xmm0,dword ptr [edx+eax]  
 addss       xmm0,dword ptr [esi+ecx]  
 mov         eax,4  
 shl         eax,1  
 mov         ecx,dword ptr [v1]  
 movss       dword ptr [ecx+eax],xmm0  
    v1[3] = v1[3]+ v2[3];
 mov         eax,4  
 imul        ecx,eax,3  
 mov         edx,4  
 imul        eax,edx,3  
 mov         edx,dword ptr [v1]  
 mov         esi,dword ptr [v2]  
 movss       xmm0,dword ptr [edx+ecx]  
 addss       xmm0,dword ptr [esi+eax]  
 mov         eax,4  
 imul        ecx,eax,3  
 mov         edx,dword ptr [v1]  
 movss       dword ptr [edx+ecx],xmm0  

    v2[0] = v1[0];
 mov         eax,4  
 imul        ecx,eax,0  
 mov         edx,4  
 imul        eax,edx,0  
 mov         edx,dword ptr [v2]  
 mov         esi,dword ptr [v1]  
 mov         ecx,dword ptr [esi+ecx]  
 mov         dword ptr [edx+eax],ecx  
    v2[1] = v1[1];
 mov         eax,4  
 shl         eax,0  
 mov         ecx,4  
 shl         ecx,0  
 mov         edx,dword ptr [v2]  
 mov         esi,dword ptr [v1]  
 mov         eax,dword ptr [esi+eax]  
 mov         dword ptr [edx+ecx],eax  
    v2[2] = v1[2];
 mov         eax,4  
 shl         eax,1  
 mov         ecx,4  
 shl         ecx,1  
 mov         edx,dword ptr [v2]  
 mov         esi,dword ptr [v1]  
 mov         eax,dword ptr [esi+eax]  
 mov         dword ptr [edx+ecx],eax  
    v2[3] = v1[3];
 mov         eax,4  
 imul        ecx,eax,3  
 mov         edx,4  
 imul        eax,edx,3  
 mov         edx,dword ptr [v2]  
 mov         esi,dword ptr [v1]  
 mov         ecx,dword ptr [esi+ecx]  
 mov         dword ptr [edx+eax],ecx  

}

Now I made a time measurement on those two functions and noticed that the inline assembler version takes approximately 10 times longer (in Release mode).
Does anybody know why?

Best Answer

On my machine (VS2015 64-bit mode), the compiler inlines vecAdd_Std and produces

00007FF625921C8F  vmovups     xmm1,xmmword ptr [__xmm@4100000040c000004080000040000000 (07FF625929D60h)]  
00007FF625921C97  vmovups     xmm4,xmm1  
00007FF625921C9B  vcvtss2sd   xmm1,xmm1,xmm4  

Test code

// Minimal driver: adds two 4-float arrays with vecAdd_Std and prints one
// element so the compiler cannot discard the computation entirely.
int main() {
    float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float rhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};

    vecAdd_Std(lhs, rhs);

    std::cout << lhs[0];
}
Related Question