c++ - SSE vector wrapper type performance compared to bare __m128
I found an interesting Gamasutra article about SIMD pitfalls, which states that it is not possible to reach the performance of the "pure" __m128 type with wrapper types. I was skeptical, so I downloaded the project files and put together a comparable test case.
It turned out (to my surprise) that the wrapper version is indeed slower. Since I don't want to talk about thin air, the test cases are the following:
In the 1st case, vec4 is a simple alias of __m128 with operators:
#include <xmmintrin.h>
#include <emmintrin.h>

using vec4 = __m128;

inline __m128 vload(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline vec4& operator+=(vec4 &va, vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
};

inline vec4& operator*=(vec4 &va, vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
};

inline vec4 operator+(vec4 va, vec4 vb)
{
    return _mm_add_ps(va, vb);
};

inline vec4 operator-(vec4 va, vec4 vb)
{
    return _mm_sub_ps(va, vb);
};

inline vec4 operator*(vec4 va, vec4 vb)
{
    return _mm_mul_ps(va, vb);
};
In the 2nd case, vec4 is a lightweight wrapper around __m128. It is not a complete wrapper, just a short sketch that covers the issue. The operators wrap exactly the same intrinsics; the only difference is that (since 16-byte alignment cannot be applied on arguments) they take vec4 by const reference:
#include <xmmintrin.h>
#include <emmintrin.h>

struct vec4
{
    __m128 simd;

    inline vec4() = default;
    inline vec4(const vec4&) = default;
    inline vec4& operator=(const vec4&) = default;
    inline vec4(__m128 s) : simd(s) {}
    inline operator __m128() const { return simd; }
    inline operator __m128&() { return simd; }
};

inline __m128 vload(float f)
{
    return _mm_set_ps(f, f, f, f);
};

inline vec4 vadd(const vec4 &va, const vec4 &vb)
{
    return _mm_add_ps(va, vb);
    // return _mm_add_ps(va.simd, vb.simd); // doesn't make a difference
};

inline vec4 vsub(const vec4 &va, const vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    // return _mm_sub_ps(va.simd, vb.simd); // doesn't make a difference
};

inline vec4 vmul(const vec4 &va, const vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    // return _mm_mul_ps(va.simd, vb.simd); // doesn't make a difference
};
And here is the test kernel, which produces different performance with the different versions of vec4:
#include <xmmintrin.h>
#include <emmintrin.h>

struct eqstate
{
    // filter #1 (low band)
    vec4 lf;    // frequency
    vec4 f1p0;  // poles ...
    vec4 f1p1;
    vec4 f1p2;
    vec4 f1p3;

    // filter #2 (high band)
    vec4 hf;    // frequency
    vec4 f2p0;  // poles ...
    vec4 f2p1;
    vec4 f2p2;
    vec4 f2p3;

    // sample history buffer
    vec4 sdm1;  // sample data minus 1
    vec4 sdm2;  //                   2
    vec4 sdm3;  //                   3

    // gain controls
    vec4 lg;    // low gain
    vec4 mg;    // mid gain
    vec4 hg;    // high gain
};

static float vsaf = (1.0f / 4294967295.0f);  // small amount (denormal fix)
static vec4 vsa = vload(vsaf);

vec4 testeq(eqstate* es, vec4& sample)
{
    // locals
    vec4 l, m, h;  // low / mid / high - sample values

    // filter #1 (lowpass)
    es->f1p0 += (es->lf * (sample - es->f1p0)) + vsa;
    //es->f1p0 = vadd(es->f1p0, vadd(vmul(es->lf, vsub(sample, es->f1p0)), vsa));

    es->f1p1 += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = vadd(es->f1p1, vmul(es->lf, vsub(es->f1p0, es->f1p1)));

    es->f1p2 += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = vadd(es->f1p2, vmul(es->lf, vsub(es->f1p1, es->f1p2)));

    es->f1p3 += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = vadd(es->f1p3, vmul(es->lf, vsub(es->f1p2, es->f1p3)));

    l = es->f1p3;

    // filter #2 (highpass)
    es->f2p0 += (es->hf * (sample - es->f2p0)) + vsa;
    //es->f2p0 = vadd(es->f2p0, vadd(vmul(es->hf, vsub(sample, es->f2p0)), vsa));

    es->f2p1 += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = vadd(es->f2p1, vmul(es->hf, vsub(es->f2p0, es->f2p1)));

    es->f2p2 += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = vadd(es->f2p2, vmul(es->hf, vsub(es->f2p1, es->f2p2)));

    es->f2p3 += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = vadd(es->f2p3, vmul(es->hf, vsub(es->f2p2, es->f2p3)));

    h = es->sdm3 - es->f2p3;
    //h = vsub(es->sdm3, es->f2p3);

    // calculate midrange (signal - (low + high))
    m = es->sdm3 - (h + l);
    //m = vsub(es->sdm3, vadd(h, l));

    // scale, combine and store
    l *= es->lg;
    m *= es->mg;
    h *= es->hg;

    //l = vmul(l, es->lg);
    //m = vmul(m, es->mg);
    //h = vmul(h, es->hg);

    // shuffle history buffer
    es->sdm3 = es->sdm2;
    es->sdm2 = es->sdm1;
    es->sdm1 = sample;

    // return result
    return (l + m + h);
    //return (vadd(l, vadd(m, h)));
}

//make these globals to enforce the function call;
static vec4 sample[1024], result[1024];
static eqstate es;

#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii = 0; ii < 1024; ii++)
    {
        result[ii] = testeq(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();

    std::cout << "timing: " << t << '\n';
    std::cin.get();

    return 0;
}
Link to the working code.
MSVC 2015 generated assembly for the 1st version:
; comdat ?testeq@@ya?at__m128@@paueqstate@@aat1@@z _text segment ?testeq@@ya?at__m128@@paueqstate@@aat1@@z proc ; testeq, comdat ; _es$dead$ = ecx ; _sample$ = edx vmovaps xmm0, xmmword ptr [edx] vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16 vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?vsa@@3t__m128@@a vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16 vmovaps xmmword ptr ?es@@3ueqstate@@a+16, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32 vmovaps xmmword ptr ?es@@3ueqstate@@a+32, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48 vmovaps xmmword ptr ?es@@3ueqstate@@a+48, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+64 vmulps xmm0, xmm0, xmm2 vaddps xmm4, xmm0, xmmword ptr ?es@@3ueqstate@@a+64 vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a+80 vmovaps xmm1, xmmword ptr ?es@@3ueqstate@@a+192 vmovaps xmmword ptr ?es@@3ueqstate@@a+64, xmm4 vmovaps xmm0, xmmword ptr [edx] vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?vsa@@3t__m128@@a vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96 vmovaps xmmword ptr ?es@@3ueqstate@@a+96, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112 vmovaps xmmword ptr ?es@@3ueqstate@@a+112, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128 vmovaps xmmword ptr ?es@@3ueqstate@@a+128, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144 vsubps xmm2, xmm1, xmm0 vmovaps xmmword ptr ?es@@3ueqstate@@a+144, xmm0 vmovaps xmm0, xmmword ptr ?es@@3ueqstate@@a+176 vmovaps xmmword ptr ?es@@3ueqstate@@a+192, xmm0 vmovaps xmm0, xmmword ptr ?es@@3ueqstate@@a+160 vmovaps xmmword ptr ?es@@3ueqstate@@a+176, xmm0 vmovaps xmm0, xmmword ptr [edx] vmovaps xmmword ptr ?es@@3ueqstate@@a+160, xmm0 vaddps xmm0, xmm4, xmm2 vsubps xmm0, xmm1, xmm0 vmulps xmm1, xmm0, xmmword ptr ?es@@3ueqstate@@a+224 vmulps xmm0, xmm2, xmmword ptr ?es@@3ueqstate@@a+240 vaddps xmm1, xmm1, xmm0 vmulps xmm0, xmm4, xmmword ptr ?es@@3ueqstate@@a+208 vaddps xmm0, xmm1, xmm0 ret 0 ?testeq@@ya?at__m128@@paueqstate@@aat1@@z endp ; testeq
MSVC 2015 generated assembly for the 2nd version:
?testeq@@ya?auvec4@vmath@@paueqstate@@aau12@@z proc ; testeq, comdat ; ___$returnudt$ = ecx ; _es$dead$ = edx push ebx mov ebx, esp sub esp, 8 , esp, -8 ; fffffff8h add esp, 4 push ebp mov ebp, dword ptr [ebx+4] mov eax, dword ptr _sample$[ebx] vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a vmovaps xmm1, xmmword ptr ?es@@3ueqstate@@a+192 mov dword ptr [esp+4], ebp vmovaps xmm0, xmmword ptr [eax] vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?vsa@@3uvec4@vmath@@a vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+16 vmovaps xmmword ptr ?es@@3ueqstate@@a+16, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+32 vmovaps xmmword ptr ?es@@3ueqstate@@a+32, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+48 vmovaps xmmword ptr ?es@@3ueqstate@@a+48, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+64 vmulps xmm0, xmm0, xmm2 vaddps xmm4, xmm0, xmmword ptr ?es@@3ueqstate@@a+64 vmovaps xmm2, xmmword ptr ?es@@3ueqstate@@a+80 vmovaps xmmword ptr ?es@@3ueqstate@@a+64, xmm4 vmovaps xmm0, xmmword ptr [eax] vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?vsa@@3uvec4@vmath@@a vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+96 vmovaps xmmword ptr ?es@@3ueqstate@@a+96, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+112 vmovaps xmmword ptr ?es@@3ueqstate@@a+112, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+128 vmovaps xmmword ptr ?es@@3ueqstate@@a+128, xmm0 vsubps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144 vmulps xmm0, xmm0, xmm2 vaddps xmm0, xmm0, xmmword ptr ?es@@3ueqstate@@a+144 vsubps xmm2, xmm1, xmm0 vmovaps xmmword ptr ?es@@3ueqstate@@a+144, xmm0 vaddps xmm0, xmm2, xmm4 vsubps xmm0, xmm1, xmm0 vmulps xmm1, xmm0, xmmword ptr ?es@@3ueqstate@@a+224 vmovdqu xmm0, xmmword ptr ?es@@3ueqstate@@a+176 vmovdqu xmmword ptr ?es@@3ueqstate@@a+192, xmm0 vmovdqu xmm0, xmmword ptr ?es@@3ueqstate@@a+160 vmovdqu xmmword ptr ?es@@3ueqstate@@a+176, xmm0 vmovdqu xmm0, xmmword ptr [eax] vmovdqu xmmword ptr ?es@@3ueqstate@@a+160, xmm0 vmulps xmm0, xmm4, xmmword ptr ?es@@3ueqstate@@a+208 vaddps xmm1, xmm0, xmm1 vmulps xmm0, xmm2, xmmword ptr ?es@@3ueqstate@@a+240 vaddps xmm0, xmm1, xmm0 vmovaps xmmword ptr [ecx], xmm0 mov eax, ecx pop ebp mov esp, ebx pop ebx ret 0 ?testeq@@ya?auvec4@vmath@@paueqstate@@aau12@@z endp ; testeq
The produced assembly of the 2nd version is noticeably longer and slower. This is not strictly related to Visual Studio, since Clang 3.8 produces similar performance results.
Clang 3.8 generated assembly for the 1st version:
"?testeq@@yat__m128@@paueqstate@@aat1@@z": # @"\01?testeq@@yat__m128@@paueqstate@@aat1@@z" lfunc_begin0: ltmp0: # bb#0: # %entry movl 8(%esp), %eax movl 4(%esp), %ecx vmovaps _vsa, %xmm0 vmovaps (%ecx), %xmm1 vmovaps 16(%ecx), %xmm2 vmovaps (%eax), %xmm3 vsubps %xmm2, %xmm3, %xmm3 vmulps %xmm3, %xmm1, %xmm3 vaddps %xmm3, %xmm0, %xmm3 vaddps %xmm3, %xmm2, %xmm2 vmovaps %xmm2, 16(%ecx) vmovaps 32(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 32(%ecx) vmovaps 48(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 48(%ecx) vmovaps 64(%ecx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm2, %xmm1, %xmm1 vaddps %xmm1, %xmm3, %xmm1 vmovaps %xmm1, 64(%ecx) vmovaps 80(%ecx), %xmm2 vmovaps 96(%ecx), %xmm3 vmovaps (%eax), %xmm4 vsubps %xmm3, %xmm4, %xmm4 vmulps %xmm4, %xmm2, %xmm4 vaddps %xmm4, %xmm0, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 96(%ecx) vmovaps 112(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 112(%ecx) vmovaps 128(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 128(%ecx) vmovaps 144(%ecx), %xmm3 vsubps %xmm3, %xmm0, %xmm0 vmulps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm3, %xmm0 vmovaps %xmm0, 144(%ecx) vmovaps 192(%ecx), %xmm2 vsubps %xmm0, %xmm2, %xmm0 vaddps %xmm0, %xmm1, %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps 208(%ecx), %xmm1, %xmm1 vmulps 224(%ecx), %xmm2, %xmm2 vmulps 240(%ecx), %xmm0, %xmm0 vmovaps 176(%ecx), %xmm3 vmovaps %xmm3, 192(%ecx) vmovaps 160(%ecx), %xmm3 vmovaps %xmm3, 176(%ecx) vmovaps (%eax), %xmm3 vmovaps %xmm3, 160(%ecx) vaddps %xmm2, %xmm0, %xmm0 vaddps %xmm0, %xmm1, %xmm0 retl lfunc_end0:
Clang 3.8 generated assembly for the 2nd version:
"?testeq@@ya?auvec4@@paueqstate@@aau1@@z": # @"\01?testeq@@ya?auvec4@@paueqstate@@aau1@@z" lfunc_begin0: ltmp0: # bb#0: # %entry movl 12(%esp), %ecx movl 8(%esp), %edx vmovaps (%edx), %xmm0 vmovaps 16(%edx), %xmm1 vmovaps (%ecx), %xmm2 vsubps %xmm1, %xmm2, %xmm2 vmulps %xmm0, %xmm2, %xmm2 vaddps _vsa, %xmm2, %xmm2 vaddps %xmm2, %xmm1, %xmm1 vmovaps %xmm1, 16(%edx) vmovaps 32(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm1 vaddps %xmm1, %xmm2, %xmm1 vmovaps %xmm1, 32(%edx) vmovaps 48(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm1 vaddps %xmm1, %xmm2, %xmm1 vmovaps %xmm1, 48(%edx) vmovaps 64(%edx), %xmm2 vsubps %xmm2, %xmm1, %xmm1 vmulps %xmm0, %xmm1, %xmm0 vaddps %xmm0, %xmm2, %xmm0 vmovaps %xmm0, 64(%edx) vmovaps 80(%edx), %xmm1 vmovaps 96(%edx), %xmm2 vmovaps (%ecx), %xmm3 vsubps %xmm2, %xmm3, %xmm3 vmulps %xmm1, %xmm3, %xmm3 vaddps _vsa, %xmm3, %xmm3 vaddps %xmm3, %xmm2, %xmm2 vmovaps %xmm2, 96(%edx) vmovaps 112(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 112(%edx) vmovaps 128(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm2 vaddps %xmm2, %xmm3, %xmm2 vmovaps %xmm2, 128(%edx) vmovaps 144(%edx), %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps %xmm1, %xmm2, %xmm1 vaddps %xmm1, %xmm3, %xmm1 vmovaps %xmm1, 144(%edx) vmovaps 192(%edx), %xmm2 vsubps %xmm1, %xmm2, %xmm1 vaddps %xmm1, %xmm0, %xmm3 vsubps %xmm3, %xmm2, %xmm2 vmulps 208(%edx), %xmm0, %xmm0 vmulps 224(%edx), %xmm2, %xmm2 movl 4(%esp), %eax vmulps 240(%edx), %xmm1, %xmm1 vmovaps 176(%edx), %xmm3 vmovaps %xmm3, 192(%edx) vmovaps 160(%edx), %xmm3 vmovaps %xmm3, 176(%edx) vmovaps (%ecx), %xmm3 vmovaps %xmm3, 160(%edx) vaddps %xmm2, %xmm0, %xmm0 vaddps %xmm0, %xmm1, %xmm0 vmovaps %xmm0, (%eax) retl lfunc_end0:
Although the number of instructions is the same, the 1st version is still about 50% faster.
I tried to identify the cause of the issue, without success. There are suspicious things, like those ugly vmovdqu instructions in the 2nd MSVC assembly. The construction, the copy assignment operator and the pass-by-reference can all unnecessarily move the data from SSE registers back to memory, yet all my attempts to solve or exactly identify the issue were unsuccessful.
I really don't think that such a simple wrapper cannot reach the same performance as the bare __m128; whatever causes the overhead, it should be possible to eliminate it.
So what is going on there?
As it turned out, the problem is not with the user defined struct vec4. It is deeply related to the x86 calling conventions.
The default x86 calling convention in Visual C++ is __cdecl, which
"pushes parameters on the stack, in reverse order (right to left)".
Now this is a problem, since vec4 should be kept and passed in an XMM register. Let's see what is happening exactly.
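For reference, and purely as an illustration (these declarations are not part of the original code), this is roughly what the two test functions look like with the default x86 convention written out explicitly instead of implied:

// Illustrative declarations only (assumed MSVC, x86 target); the names with
// suffixes are hypothetical, they just label the two variants from above.
__m128 __cdecl testeq_alias(eqstate* es, __m128& sample);        // 1st case: alias of __m128
vec4   __cdecl testeq_wrapper(eqstate* es, const vec4& sample);  // 2nd case: wrapper struct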
1st case
In the first case vec4 is a simple type alias of __m128.
using vec4 = __m128;

/* ... */

vec4 testeq(eqstate* es, vec4 &sample)
{
    ...
}
The generated function header of testeq in the assembly is
?testeq@@ya?at__m128@@paueqstate@@aat1@@z proc    ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
        ...
Nice.
2nd case
In the second case vec4 is not an alias of __m128; it is a user defined type now.
Here I investigate the compilation for both the x86 and the x64 platform.
x86 (32-bit compilation)
Since __cdecl (which is the default calling convention in x86) doesn't allow passing aligned values to functions (doing so would emit error C2719: 'sample': formal parameter with requested alignment of 16 won't be aligned), we pass the argument by const reference.
struct vec4
{
    __m128 simd;
    /* ... */
};

/* ... */

vec4 testeq(eqstate* es, const vec4 &sample)
{
    ...
}
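Just to make the constraint concrete (this declaration is not in the original project, it only shows what gets rejected):

// Hypothetical by-value signature: under __cdecl on x86 this fails to compile
// with error C2719, because the 16-byte aligned vec4 would land on the stack
// without its required alignment.
vec4 testeq(eqstate* es, vec4 sample);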
The const reference version above generates the function header of testeq as
?testeq@@ya?auvec4@@paueqstate@@abu1@@z proc      ; testeq, comdat
; ___$returnudt$ = ecx
; _es$ = edx
        push    ebx
        mov     ebx, esp
        sub     esp, 8
        and     esp, -8                           ; fffffff8h
        add     esp, 4
        push    ebp
        mov     ebp, dword ptr [ebx+4]
        mov     eax, dword ptr _sample$[ebx]
        ...
This is not as simple as the one in the 1st case. The arguments are moved to the stack. There are additional mov instructions between the first few SSE instructions too, which are not listed here. Overall, these instructions are enough to hurt the performance.
x64 (64-bit compilation)
Windows in x64 mode uses a different calling convention as part of the x64 Application Binary Interface (ABI).
This convention tries to keep the data in registers if possible, in such a way that floating-point data is kept in XMM registers.
From the MSDN Overview of x64 Calling Conventions:
The x64 Application Binary Interface (ABI) is a 4 register fast-call calling convention, with stack-backing for those registers. There is a strict one-to-one correspondence between arguments in a function, and the registers for those arguments. Any argument that doesn't fit in 8 bytes, or is not 1, 2, 4, or 8 bytes, must be passed by reference. (...) All floating point operations are done using the 16 XMM registers. The arguments are passed in registers RCX, RDX, R8, and R9. If the arguments are float/double, they are passed in XMM0L, XMM1L, XMM2L, and XMM3L. 16 byte arguments are passed by reference.
From the Wikipedia page on x86-64 calling conventions:
The Microsoft x64 calling convention is followed on Windows and pre-boot UEFI (for long mode on x86-64). It uses registers RCX, RDX, R8, R9 for the first four integer or pointer arguments (in that order), and XMM0, XMM1, XMM2, XMM3 are used for floating point arguments. Additional arguments are pushed onto the stack (right to left). Integer return values (similar to x86) are returned in RAX if 64 bits or less. Floating point return values are returned in XMM0.
So the second case in x64 mode generates the function header of testeq as
?testeq@@yq?auvec4@@paueqstate@@abu1@@z proc      ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
        ...
This is essentially the same as in the 1st case!
Solution
For x86 mode the presented behavior should be fixed.
The simplest solution is to inline the function. Although inline is just a hint and the compiler can ignore it, you can tell the compiler to always inline the function. However, sometimes this is not desired because of the function size or some other reason.
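A minimal sketch of the forced-inline variant, assuming MSVC's __forceinline keyword (GCC and Clang spell it __attribute__((always_inline))); once the call is gone, there is no argument passing and therefore no calling-convention penalty:

// Force inlining so that no call (and no __cdecl argument passing) remains.
// __forceinline is still only a strong request; if MSVC cannot honor it,
// it emits warning C4714 and keeps the call.
__forceinline vec4 testeq(eqstate* es, const vec4 &sample)
{
    /* ... same body as above ... */
}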
Fortunately, Microsoft introduced the __vectorcall convention in Visual Studio 2013 and above (available in both x86 and x64 mode). This is very similar to the default Windows x64 calling convention, but with more utilizable registers.
Let's rewrite the 2nd case with __vectorcall:
vec4 __vectorcall testeq(eqstate* es, const vec4 &sample)
{
    ...
}
Now the generated assembly function header of testeq is
?testeq@@yq?auvec4@@paueqstate@@abu1@@z proc      ; testeq, comdat
; _es$ = ecx
; _sample$ = edx
        ...
which is finally the same as in the 1st case and as in the 2nd case in x64.
As Peter Cordes pointed out, to take full advantage of __vectorcall, the vec4 argument should be passed by value instead of by constant reference. To do this, the passed type must meet some requirements: it must be trivially copy constructible (no user defined copy constructors) and it shouldn't contain a union. More info in the comments below and here.
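A sketch of that by-value variant; the vec4 defined above is trivially copyable and contains no union, so it meets the requirements:

// Pass the wrapper by value; with __vectorcall it can then stay in an XMM
// register instead of being spilled to memory behind a reference.
vec4 __vectorcall testeq(eqstate* es, vec4 sample)
{
    /* ... */
}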
Final words
It looks like MSVC under the hood automatically applies the __vectorcall convention as an optimization when it detects a bare __m128 argument. Otherwise it uses the default calling convention __cdecl (you can change this behavior with compiler options).
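One such option is /Gv, which makes __vectorcall the default convention for eligible functions, so the wrapper version needs no per-function annotation; for example a build along the lines of cl /O2 /arch:AVX /Gv main.cpp (the exact flag set here is an assumption, not the original build command).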
Some people told me in the comments that they didn't see any difference between the GCC and Clang generated assembly of the two cases. This is because these compilers, with the optimization flag -O2, simply inline the testeq function into the test loop body (see). It is also possible that they are more clever than MSVC and perform a better optimization of the function call. A sketch of how to prevent that inlining for a fair comparison follows below.
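A minimal sketch, assuming GCC/Clang attribute syntax: marking testeq as noinline forces an actual call, so the cost of argument passing stays visible in the measurement.

// Keep testeq as a real call even at -O2, so the calling-convention overhead
// is not optimized away by inlining it into the benchmark loop.
__attribute__((noinline)) vec4 testeq(eqstate* es, const vec4 &sample)
{
    /* ... */
}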