c++ - SSE vector wrapper type performance compared to bare __m128


I found an interesting Gamasutra article about SIMD pitfalls, which states that it is not possible to reach the performance of the "pure" __m128 type with wrapper types. I was skeptical, so I downloaded the project files and fabricated a comparable test case.

It turned out (to my surprise) that the wrapper version is significantly slower. Since I don't want to talk out of thin air, the test cases are the following:

In the 1st case vec4 is a simple alias of the __m128 type, with some operators:

#include <xmmintrin.h>
#include <emmintrin.h>

using vec4 = __m128;

inline __m128 vload(float f)
{
    return _mm_set_ps(f, f, f, f);
}

inline vec4& operator+=(vec4 &va, vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
}

inline vec4& operator*=(vec4 &va, vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
}

inline vec4 operator+(vec4 va, vec4 vb)
{
    return _mm_add_ps(va, vb);
}

inline vec4 operator-(vec4 va, vec4 vb)
{
    return _mm_sub_ps(va, vb);
}

inline vec4 operator*(vec4 va, vec4 vb)
{
    return _mm_mul_ps(va, vb);
}

In the 2nd case vec4 is a lightweight wrapper around __m128. It is not a complete wrapper, just a short sketch that covers the issue. The operators wrap exactly the same intrinsics; the only difference is that (since 16-byte alignment cannot be applied on arguments) they take vec4 by const reference:

#include <xmmintrin.h>
#include <emmintrin.h>

struct vec4
{
    __m128 simd;

    inline vec4() = default;
    inline vec4(const vec4&) = default;
    inline vec4& operator=(const vec4&) = default;

    inline vec4(__m128 s)
        : simd(s)
    {}

    inline operator __m128() const
    {
        return simd;
    }

    inline operator __m128&()
    {
        return simd;
    }
};

inline __m128 vload(float f)
{
    return _mm_set_ps(f, f, f, f);
}

inline vec4 vadd(const vec4 &va, const vec4 &vb)
{
    return _mm_add_ps(va, vb);
    // return _mm_add_ps(va.simd, vb.simd); // doesn't make a difference
}

inline vec4 vsub(const vec4 &va, const vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    // return _mm_sub_ps(va.simd, vb.simd); // doesn't make a difference
}

inline vec4 vmul(const vec4 &va, const vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    // return _mm_mul_ps(va.simd, vb.simd); // doesn't make a difference
}

And here is the test kernel that produces different performance with the different versions of vec4:

#include <xmmintrin.h>
#include <emmintrin.h>

struct eqstate
{
    // filter #1 (low band)

    vec4  lf;       // frequency
    vec4  f1p0;     // poles ...
    vec4  f1p1;
    vec4  f1p2;
    vec4  f1p3;

    // filter #2 (high band)

    vec4  hf;       // frequency
    vec4  f2p0;     // poles ...
    vec4  f2p1;
    vec4  f2p2;
    vec4  f2p3;

    // sample history buffer

    vec4  sdm1;     // sample data minus 1
    vec4  sdm2;     //                   2
    vec4  sdm3;     //                   3

    // gain controls

    vec4  lg;       // low  gain
    vec4  mg;       // mid  gain
    vec4  hg;       // high gain
};

static float vsaf = (1.0f / 4294967295.0f);   // small amount (denormal fix)
static vec4 vsa = vload(vsaf);

vec4 testeq(eqstate* es, vec4& sample)
{
    // locals

    vec4  l, m, h;      // low / mid / high - sample values

    // filter #1 (lowpass)

    es->f1p0  += (es->lf * (sample   - es->f1p0)) + vsa;
    //es->f1p0 = vadd(es->f1p0, vadd(vmul(es->lf, vsub(sample, es->f1p0)), vsa));

    es->f1p1  += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = vadd(es->f1p1, vmul(es->lf, vsub(es->f1p0, es->f1p1)));

    es->f1p2  += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = vadd(es->f1p2, vmul(es->lf, vsub(es->f1p1, es->f1p2)));

    es->f1p3  += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = vadd(es->f1p3, vmul(es->lf, vsub(es->f1p2, es->f1p3)));

    l          = es->f1p3;

    // filter #2 (highpass)

    es->f2p0  += (es->hf * (sample   - es->f2p0)) + vsa;
    //es->f2p0 = vadd(es->f2p0, vadd(vmul(es->hf, vsub(sample, es->f2p0)), vsa));

    es->f2p1  += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = vadd(es->f2p1, vmul(es->hf, vsub(es->f2p0, es->f2p1)));

    es->f2p2  += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = vadd(es->f2p2, vmul(es->hf, vsub(es->f2p1, es->f2p2)));

    es->f2p3  += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = vadd(es->f2p3, vmul(es->hf, vsub(es->f2p2, es->f2p3)));

    h          = es->sdm3 - es->f2p3;
    //h = vsub(es->sdm3, es->f2p3);

    // calculate midrange (signal - (low + high))

    m          = es->sdm3 - (h + l);
    //m = vsub(es->sdm3, vadd(h, l));

    // scale, combine and store

    l         *= es->lg;
    m         *= es->mg;
    h         *= es->hg;

    //l = vmul(l, es->lg);
    //m = vmul(m, es->mg);
    //h = vmul(h, es->hg);

    // shuffle history buffer

    es->sdm3   = es->sdm2;
    es->sdm2   = es->sdm1;
    es->sdm1   = sample;

    // return result

    return (l + m + h);
    //return (vadd(l, vadd(m, h)));
}

// make these globals to enforce a function call;
static vec4 sample[1024], result[1024];
static eqstate es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii = 0; ii < 1024; ii++)
    {
        result[ii] = testeq(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing: " << t << '\n';

    std::cin.get();

    return 0;
}

Link to the working code.


MSVC 2015 generated assembly for the 1st version:

;   comdat ?testeq@@ya?at__m128@@paueqstate@@aat1@@z
_text   segment
?testeq@@ya?at__m128@@paueqstate@@aat1@@z proc      ; testeq, comdat
; _es$dead$ = ecx
; _sample$ = edx
    vmovaps xmm0, xmmword ptr [edx]
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16
    vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?vsa@@3t__m128@@a
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16
    vmovaps xmmword ptr ?es@@3ueqstate@@a+16, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32
    vmovaps xmmword ptr ?es@@3ueqstate@@a+32, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48
    vmovaps xmmword ptr ?es@@3ueqstate@@a+48, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, xmmword ptr ?es@@3ueqstate@@a+64
    vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a+80
    vmovaps xmm1, xmmword ptr ?es@@3ueqstate@@a+192
    vmovaps xmmword ptr ?es@@3ueqstate@@a+64, xmm4
    vmovaps xmm0, xmmword ptr [edx]
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?vsa@@3t__m128@@a
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96
    vmovaps xmmword ptr ?es@@3ueqstate@@a+96, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112
    vmovaps xmmword ptr ?es@@3ueqstate@@a+112, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128
    vmovaps xmmword ptr ?es@@3ueqstate@@a+128, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps xmmword ptr ?es@@3ueqstate@@a+144, xmm0
    vmovaps xmm0, xmmword ptr ?es@@3ueqstate@@a+176
    vmovaps xmmword ptr ?es@@3ueqstate@@a+192, xmm0
    vmovaps xmm0, xmmword ptr ?es@@3ueqstate@@a+160
    vmovaps xmmword ptr ?es@@3ueqstate@@a+176, xmm0
    vmovaps xmm0, xmmword ptr [edx]
    vmovaps xmmword ptr ?es@@3ueqstate@@a+160, xmm0
    vaddps  xmm0, xmm4, xmm2
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, xmmword ptr ?es@@3ueqstate@@a+224
    vmulps  xmm0, xmm2, xmmword ptr ?es@@3ueqstate@@a+240
    vaddps  xmm1, xmm1, xmm0
    vmulps  xmm0, xmm4, xmmword ptr ?es@@3ueqstate@@a+208
    vaddps  xmm0, xmm1, xmm0
    ret 0
?testeq@@ya?at__m128@@paueqstate@@aat1@@z endp      ; testeq

MSVC 2015 generated assembly for the 2nd version:

?testeq@@ya?auvec4@vmath@@paueqstate@@aau12@@z proc ; testeq, comdat
; ___$returnudt$ = ecx
; _es$dead$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8h
    add esp, 4
    push    ebp
    mov ebp, dword ptr [ebx+4]
    mov eax, dword ptr _sample$[ebx]
    vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a
    vmovaps xmm1, xmmword ptr ?es@@3ueqstate@@a+192
    mov dword ptr [esp+4], ebp
    vmovaps xmm0, xmmword ptr [eax]
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?vsa@@3uvec4@vmath@@a
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16
    vmovaps xmmword ptr ?es@@3ueqstate@@a+16, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32
    vmovaps xmmword ptr ?es@@3ueqstate@@a+32, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48
    vmovaps xmmword ptr ?es@@3ueqstate@@a+48, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+64
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm4, xmm0, xmmword ptr ?es@@3ueqstate@@a+64
    vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a+80
    vmovaps xmmword ptr ?es@@3ueqstate@@a+64, xmm4
    vmovaps xmm0, xmmword ptr [eax]
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?vsa@@3uvec4@vmath@@a
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96
    vmovaps xmmword ptr ?es@@3ueqstate@@a+96, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112
    vmovaps xmmword ptr ?es@@3ueqstate@@a+112, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128
    vmovaps xmmword ptr ?es@@3ueqstate@@a+128, xmm0
    vsubps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144
    vmulps  xmm0, xmm0, xmm2
    vaddps  xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144
    vsubps  xmm2, xmm1, xmm0
    vmovaps xmmword ptr ?es@@3ueqstate@@a+144, xmm0
    vaddps  xmm0, xmm2, xmm4
    vsubps  xmm0, xmm1, xmm0
    vmulps  xmm1, xmm0, xmmword ptr ?es@@3ueqstate@@a+224
    vmovdqu xmm0, xmmword ptr ?es@@3ueqstate@@a+176
    vmovdqu xmmword ptr ?es@@3ueqstate@@a+192, xmm0
    vmovdqu xmm0, xmmword ptr ?es@@3ueqstate@@a+160
    vmovdqu xmmword ptr ?es@@3ueqstate@@a+176, xmm0
    vmovdqu xmm0, xmmword ptr [eax]
    vmovdqu xmmword ptr ?es@@3ueqstate@@a+160, xmm0
    vmulps  xmm0, xmm4, xmmword ptr ?es@@3ueqstate@@a+208
    vaddps  xmm1, xmm0, xmm1
    vmulps  xmm0, xmm2, xmmword ptr ?es@@3ueqstate@@a+240
    vaddps  xmm0, xmm1, xmm0
    vmovaps xmmword ptr [ecx], xmm0
    mov eax, ecx
    pop ebp
    mov esp, ebx
    pop ebx
    ret 0
?testeq@@ya?auvec4@vmath@@paueqstate@@aau12@@z endp ; testeq

The produced assembly of the 2nd version is significantly longer and slower. This is not strictly related to Visual Studio, since Clang 3.8 produces similar performance results.


Clang 3.8 generated assembly for the 1st version:

"?testeq@@yat__m128@@paueqstate@@aat1@@z": # @"\01?testeq@@yat__m128@@paueqstate@@aat1@@z"
lfunc_begin0:
ltmp0:
# bb#0:                                 # %entry
    movl    8(%esp), %eax
    movl    4(%esp), %ecx
    vmovaps _vsa, %xmm0
    vmovaps (%ecx), %xmm1
    vmovaps 16(%ecx), %xmm2
    vmovaps (%eax), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm3, %xmm1, %xmm3
    vaddps  %xmm3, %xmm0, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 16(%ecx)
    vmovaps 32(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 32(%ecx)
    vmovaps 48(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 48(%ecx)
    vmovaps 64(%ecx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm2, %xmm1, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 64(%ecx)
    vmovaps 80(%ecx), %xmm2
    vmovaps 96(%ecx), %xmm3
    vmovaps (%eax), %xmm4
    vsubps  %xmm3, %xmm4, %xmm4
    vmulps  %xmm4, %xmm2, %xmm4
    vaddps  %xmm4, %xmm0, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 96(%ecx)
    vmovaps 112(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 112(%ecx)
    vmovaps 128(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 128(%ecx)
    vmovaps 144(%ecx), %xmm3
    vsubps  %xmm3, %xmm0, %xmm0
    vmulps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm3, %xmm0
    vmovaps %xmm0, 144(%ecx)
    vmovaps 192(%ecx), %xmm2
    vsubps  %xmm0, %xmm2, %xmm0
    vaddps  %xmm0, %xmm1, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%ecx), %xmm1, %xmm1
    vmulps  224(%ecx), %xmm2, %xmm2
    vmulps  240(%ecx), %xmm0, %xmm0
    vmovaps 176(%ecx), %xmm3
    vmovaps %xmm3, 192(%ecx)
    vmovaps 160(%ecx), %xmm3
    vmovaps %xmm3, 176(%ecx)
    vmovaps (%eax), %xmm3
    vmovaps %xmm3, 160(%ecx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    retl
lfunc_end0:

Clang 3.8 generated assembly for the 2nd version:

"?testeq@@ya?auvec4@@paueqstate@@aau1@@z": # @"\01?testeq@@ya?auvec4@@paueqstate@@aau1@@z"
lfunc_begin0:
ltmp0:
# bb#0:                                 # %entry
    movl    12(%esp), %ecx
    movl    8(%esp), %edx
    vmovaps (%edx), %xmm0
    vmovaps 16(%edx), %xmm1
    vmovaps (%ecx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm2
    vmulps  %xmm0, %xmm2, %xmm2
    vaddps  _vsa, %xmm2, %xmm2
    vaddps  %xmm2, %xmm1, %xmm1
    vmovaps %xmm1, 16(%edx)
    vmovaps 32(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 32(%edx)
    vmovaps 48(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm1
    vaddps  %xmm1, %xmm2, %xmm1
    vmovaps %xmm1, 48(%edx)
    vmovaps 64(%edx), %xmm2
    vsubps  %xmm2, %xmm1, %xmm1
    vmulps  %xmm0, %xmm1, %xmm0
    vaddps  %xmm0, %xmm2, %xmm0
    vmovaps %xmm0, 64(%edx)
    vmovaps 80(%edx), %xmm1
    vmovaps 96(%edx), %xmm2
    vmovaps (%ecx), %xmm3
    vsubps  %xmm2, %xmm3, %xmm3
    vmulps  %xmm1, %xmm3, %xmm3
    vaddps  _vsa, %xmm3, %xmm3
    vaddps  %xmm3, %xmm2, %xmm2
    vmovaps %xmm2, 96(%edx)
    vmovaps 112(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 112(%edx)
    vmovaps 128(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm2
    vaddps  %xmm2, %xmm3, %xmm2
    vmovaps %xmm2, 128(%edx)
    vmovaps 144(%edx), %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm3, %xmm1
    vmovaps %xmm1, 144(%edx)
    vmovaps 192(%edx), %xmm2
    vsubps  %xmm1, %xmm2, %xmm1
    vaddps  %xmm1, %xmm0, %xmm3
    vsubps  %xmm3, %xmm2, %xmm2
    vmulps  208(%edx), %xmm0, %xmm0
    vmulps  224(%edx), %xmm2, %xmm2
    movl    4(%esp), %eax
    vmulps  240(%edx), %xmm1, %xmm1
    vmovaps 176(%edx), %xmm3
    vmovaps %xmm3, 192(%edx)
    vmovaps 160(%edx), %xmm3
    vmovaps %xmm3, 176(%edx)
    vmovaps (%ecx), %xmm3
    vmovaps %xmm3, 160(%edx)
    vaddps  %xmm2, %xmm0, %xmm0
    vaddps  %xmm0, %xmm1, %xmm0
    vmovaps %xmm0, (%eax)
    retl
lfunc_end0:

Although the number of instructions is about the same, the 1st version is still ~50% faster.


I tried to identify the cause of the issue, but without success. There are some suspicious things, like the ugly vmovdqu instructions in the 2nd MSVC assembly. The construction, the copy assignment operator and the pass-by-reference can also unnecessarily move data from SSE registers to memory, but all my attempts to solve or exactly identify the issue were unsuccessful.

I don't believe that such a simple wrapper cannot reach the same performance as the bare __m128; whatever causes the overhead should be possible to eliminate.

So what is going on there?

As it turned out, the problem is not with the user defined struct vec4. It is related to the x86 calling conventions.

The default x86 calling convention in Visual C++ is __cdecl, which

pushes parameters on the stack, in reverse order (right to left).

Now the problem is that vec4 should be kept and passed in an XMM register. Let's see what is actually happening.


1st case

In the first case vec4 is a simple type alias of __m128.

using vec4 = __m128;

/* ... */

vec4 testeq(eqstate* es, vec4 &sample) { ... }

The generated function header of testeq in assembly is:

?testeq@@ya?at__m128@@paueqstate@@aat1@@z proc      ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
...

Nice.


2nd case

In the second case vec4 is not an alias of __m128; it is a user defined type now.

Here we investigate the compilation for both the x86 and x64 platforms.

x86 (32-bit compilation)

Since __cdecl (which is the default calling convention in x86) doesn't allow passing aligned values to functions (that would emit Error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned), we pass it by const reference.

struct vec4
{
    __m128 simd;
    /* ... */
};

/* ... */

vec4 testeq(eqstate* es, const vec4 &sample) { ... }

which generates the function header for testeq as

?testeq@@ya?auvec4@@paueqstate@@abu1@@z proc        ; testeq, comdat
; ___$returnudt$ = ecx
; _es$ = edx
    push    ebx
    mov ebx, esp
    sub esp, 8
    and esp, -8                 ; fffffff8h
    add esp, 4
    push    ebp
    mov ebp, dword ptr [ebx+4]
    mov eax, dword ptr _sample$[ebx]
    ...

This is not as simple as the one in the 1st case. The arguments are moved to the stack. There are additional mov instructions between the first few SSE instructions too, which are not listed here. Overall, these instructions are enough to hurt performance.
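For completeness, this is the kind of by-value signature that triggers the C2719 error mentioned above under 32-bit __cdecl (a hypothetical sketch, not part of the original test; the function name is made up):

// x86 + __cdecl: MSVC rejects this with
// error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned
vec4 testeq_byval(eqstate* es, vec4 sample);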

x64 (64-bit compilation)

Windows in x64 mode uses a different calling convention as part of the x64 Application Binary Interface (ABI).

This convention tries to keep the data in registers if possible, in such a way that floating-point data is kept in XMM registers.

From the MSDN Overview of x64 Calling Conventions:

The x64 Application Binary Interface (ABI) is a 4 register fast-call calling convention, with stack-backing for those registers. There is a strict one-to-one correspondence between arguments in a function, and the registers for those arguments. Any argument that doesn't fit in 8 bytes, or is not 1, 2, 4, or 8 bytes, must be passed by reference. (...) All floating point operations are done using the 16 XMM registers. The arguments are passed in registers RCX, RDX, R8, and R9. If the arguments are float/double, they are passed in XMM0L, XMM1L, XMM2L, and XMM3L. 16 byte arguments are passed by reference.

From the Wikipedia page on x86-64 calling conventions:

The Microsoft x64 calling convention is followed on Windows and pre-boot UEFI (for long mode on x86-64). It uses registers RCX, RDX, R8, R9 for the first four integer or pointer arguments (in that order), and XMM0, XMM1, XMM2, XMM3 are used for floating point arguments. Additional arguments are pushed onto the stack (right to left). Integer return values (similar to x86) are returned in RAX if 64 bits or less. Floating point return values are returned in XMM0.

So the second case in x64 mode generates the function header for testeq as

?testeq@@yq?auvec4@@paueqstate@@abu1@@z proc        ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
...

This is the same as in the 1st case!


solution

For x86 mode the presented behavior should be fixed.

The simplest solution is to inline the function. Although inline is just a hint and the compiler can ignore it, we can tell the compiler to always inline the function. However, this is sometimes not desired because of the function size or other reasons.
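A minimal sketch of what force-inlining could look like with MSVC's __forceinline keyword (assuming the definition is visible in the translation unit that runs the benchmark loop):

// force the compiler to inline the kernel, so no call (and no argument passing) remains
__forceinline vec4 testeq(eqstate* es, const vec4 &sample)
{
    /* ... same body as above ... */
}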

Fortunately Microsoft introduced the __vectorcall convention in Visual Studio 2013 and above (available in both x86 and x64 mode). It is very similar to the default Windows x64 calling convention, but with more utilizable registers.

Let's rewrite the 2nd case with __vectorcall:

vec4 __vectorcall testeq(eqstate* es, const vec4 &sample) { ... } 

Now the generated assembly function header for testeq is

?testeq@@yq?auvec4@@paueqstate@@abu1@@z proc        ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
...

which is finally the same as the 1st case and the 2nd case in x64.

As Peter Cordes pointed out, to take full advantage of __vectorcall, the vec4 argument should be passed by value instead of by constant reference. To do this, the passed type should meet some requirements: it must be trivially copy constructible (no user defined copy constructors) and shouldn't contain a union. More info in the comments below and here.
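A sketch of how the by-value variant could look (assuming vec4 keeps only the defaulted special members shown earlier, so it stays trivially copy constructible):

// vec4 is trivially copyable, so __vectorcall can pass it directly in an XMM register
vec4 __vectorcall testeq(eqstate* es, vec4 sample)
{
    /* ... same body as above ... */
}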

final words

It looks like MSVC under the hood automatically applies the __vectorcall convention as an optimization when it detects an __m128 argument. Otherwise it uses the default calling convention __cdecl (you can change this behavior with compiler options).
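For example, the /Gd, /Gr, /Gz and /Gv compiler switches select the default calling convention for x86; a command line roughly like the following (the file name is just an illustration) makes __vectorcall the default, so the wrapper version would not need a per-function annotation:

cl /O2 /arch:AVX /Gv main.cpp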

People told me in the comments that they didn't see any difference between the GCC and Clang generated assembly of the two cases. This is because these compilers with the optimization flag -O2 simply inline the testeq function into the test loop body (see). It is also possible that they would be more clever than MSVC and perform a better optimization of the function call.
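If someone wants to reproduce the call overhead with GCC or Clang anyway, one option (a sketch, not something from the original test) is to forbid inlining of the kernel so the calling convention stays visible:

// keep testeq as a real call even at -O2, so argument passing shows up in the benchmark
__attribute__((noinline)) vec4 testeq(eqstate* es, const vec4 &sample);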

