#include <xmmintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h>
#endif
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
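// Illustrative note (not part of the original header): AKSIMD_ALIGNSIZE rounds
// a byte count up to the next multiple of 16 by adding 15 and clearing the low
// four bits, e.g. AKSIMD_ALIGNSIZE(37) == 48 and AKSIMD_ALIGNSIZE(64) == 64.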
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )
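// Illustrative note (not part of the original header): as with _mm_set_ps, the
// AKSIMD_SETV_* arguments run from the highest lane to the lowest, so
// AKSIMD_SETV_V4F32( 3.f, 2.f, 1.f, 0.f ) puts 0.f in lane 0 and 3.f in lane 3.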
static AkForceInline AKSIMD_V4COND AKSIMD_SETMASK_V4COND( AkUInt32 x )
{
	__m128i temp = _mm_set_epi32(8, 4, 2, 1);
	__m128i xvec = _mm_set1_epi32(x);
	__m128i xand = _mm_and_si128(xvec, temp);
	return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
}
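// Illustrative note (not part of the original header): bit i of x selects lane
// i of the result, so an input of 0x5 (binary 0101) yields { ~0, 0, ~0, 0 },
// a per-lane condition mask usable with the select operations further below.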
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )
#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), i ))
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
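// Illustrative note (not part of the original header): for interleaved complex
// data v = (re0, im0, re1, im1), AKSIMD_DUP_EVEN(v) gives (re0, re0, re1, re1)
// and AKSIMD_DUP_ODD(v) gives (im0, im0, im1, im1), the usual first step of a
// SIMD complex multiply.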
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )
#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__), (__c__) )
#else
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#endif

#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )
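// Illustrative sketch (not part of the original header; the function name is
// hypothetical): a per-lane linear interpolation a + t*(b - a) built on
// AKSIMD_MADD_V4F32, which maps to a fused multiply-add when FMA is available.
static AkForceInline AKSIMD_V4F32 AkSimdLerp_Example( AKSIMD_V4F32 a, AKSIMD_V4F32 b, AKSIMD_V4F32 t )
{
	return AKSIMD_MADD_V4F32( t, AKSIMD_SUB_V4F32( b, a ), a );
}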
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )
#define AKSIMD_RECIP_V4F32( __a__ ) _mm_rcp_ps( (__a__) )
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps( a, b )
static AkForceInline AKSIMD_V4F32 AKSIMD_CEIL_V4F32( const AKSIMD_V4F32 & x )
{
	static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
	return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
}
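// Illustrative note (not part of the original header): adding just under 0.5
// before a round-to-nearest conversion approximates ceil() while leaving exact
// integers unchanged, e.g. 1.25 -> 2.0 but 1.0 -> 1.0.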
static AkForceInline AKSIMD_V4F32 AKSIMD_HORIZONTALADD_V4F32( AKSIMD_V4F32 vVec )
{
	__m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);           // (b, a, d, c)
	__m128 vHaddAb = _mm_add_ps(vVec, vAb);                  // (a+b, a+b, c+d, c+d)
	__m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E); // (c+d, c+d, a+b, a+b)
	__m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);         // every lane = a+b+c+d
	return vHaddAbcd;
}
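// Illustrative sketch (not part of the original header; the function name is
// hypothetical): a 4-wide dot product built on the horizontal add above; the
// sum ends up in every lane, so lane 0 is extracted as the scalar result.
static AkForceInline AkReal32 AkSimdDot4_Example( AKSIMD_V4F32 a, AKSIMD_V4F32 b )
{
	AKSIMD_V4F32 vSum = AKSIMD_HORIZONTALADD_V4F32( AKSIMD_MUL_V4F32( a, b ) );
	return _mm_cvtss_f32( vSum );
}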
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL_V4F32( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };

	AKSIMD_V4F32 vTmp1 = AKSIMD_DUP_EVEN( vCIn1 );  // (re0, re0, re1, re1)
	vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
	AKSIMD_V4F32 vTmp2 = AKSIMD_DUP_ODD( vCIn1 );   // (im0, im0, im1, im1)
	vTmp2 = AKSIMD_XOR_V4F32( vTmp2, vSign );       // negate the lanes that subtract
	vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
	return vTmp2;
}
#ifdef __SSE3__
#include <pmmintrin.h>
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b )
#else
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))
#endif
#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#elif defined(AK_CPU_X86) || defined(AK_CPU_X86_64)
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#else
#define AKSIMD_ASSERTFLUSHZEROMODE
#endif
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )
#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32(a,b)
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32(a,b)
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128(a,b)
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128(a,b)
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32(a,b)
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128(a,_mm_set1_epi32(~0))
#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps(a,b)
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps(a,b)
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps(a,_mm_castsi128_ps(_mm_set1_epi32(~0)))
#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps(a,b)
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps(a,b)
#define AKSIMD_MULLO16_V4I32( a, b ) _mm_mullo_epi16(a, b)
static AkForceInline AKSIMD_V4I32 AKSIMD_MULLO_V4I32( const AKSIMD_V4I32 vIn1, const AKSIMD_V4I32 vIn2 )
{
#ifdef __SSE4_1__ // use SSE 4.1 version directly where possible
	return _mm_mullo_epi32(vIn1, vIn2);
#else // use SSE 2 otherwise
	__m128i tmp1 = _mm_mul_epu32(vIn1, vIn2);
	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4));
	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
#endif
}
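// Illustrative note (not part of the original header): the SSE2 fallback runs
// two 32x32->64 multiplies (_mm_mul_epu32) on the even and the odd lanes, then
// shuffles the low 32 bits of each product back into one vector; the low 32
// bits of the product are the same for signed and unsigned inputs.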
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
static AkForceInline AKSIMD_V4I32X2 AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	__m128i data[4] = {
		_mm_set1_epi32(*(AkInt32*)addr0),
		_mm_set1_epi32(*(AkInt32*)addr1),
		_mm_set1_epi32(*(AkInt32*)addr2),
		_mm_set1_epi32(*(AkInt32*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi32(data[0], data[1]),
		_mm_unpacklo_epi32(data[2], data[3]),
	};

	__m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);

	AKSIMD_V4I32X2 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16), // low 16 bits of each gather, sign-extended
		_mm_srai_epi32(shuffle, 16)                      // high 16 bits of each gather, sign-extended
	};
	return ret;
}
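// Illustrative sketch (not part of the original header; the source pointers
// are hypothetical, and the AKSIMD_V4I32X2 result is assumed to expose its two
// vectors as val[0] and val[1]): gathering one interleaved 16-bit stereo frame
// from each of four sources, yielding the four left samples in val[0] and the
// four right samples in val[1]:
//   AKSIMD_V4I32X2 frames = AKSIMD_GATHER_V4I32_AND_DEINTERLEAVE_V4I32X2(
//       pSrc3, pSrc2, pSrc1, pSrc0 );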
static AkForceInline AKSIMD_V4I32X4 AKSIMD_GATHER_V4I64_AND_DEINTERLEAVE_V4I32X4( AkInt16* addr3, AkInt16* addr2, AkInt16* addr1, AkInt16* addr0 )
{
	__m128i data[4] = {
		_mm_set1_epi64x(*(AkInt64*)addr0),
		_mm_set1_epi64x(*(AkInt64*)addr1),
		_mm_set1_epi64x(*(AkInt64*)addr2),
		_mm_set1_epi64x(*(AkInt64*)addr3),
	};

	__m128i group[2] = {
		_mm_unpacklo_epi64(data[0], data[1]),
		_mm_unpacklo_epi64(data[2], data[3]),
	};

	__m128i shuffle[2] = {
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
		_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
	};

	AKSIMD_V4I32X4 ret{
		_mm_srai_epi32(_mm_slli_epi32(shuffle[0], 16), 16),
		_mm_srai_epi32(shuffle[0], 16),
		_mm_srai_epi32(_mm_slli_epi32(shuffle[1], 16), 16),
		_mm_srai_epi32(shuffle[1], 16),
	};
	return ret;
}
#define AKSIMD_CMP_CTRLMASK __m128
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )
#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )
#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4COND vMask )
{
#if defined(__SSE4_1__)
	return _mm_blendv_ps(vA, vB, vMask);
#else
	vB = _mm_and_ps( vB, vMask );
	vA = _mm_andnot_ps( vMask, vA );
	return _mm_or_ps( vA, vB );
#endif
}
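// Illustrative sketch (not part of the original header; the function name is
// hypothetical): branchless per-lane selection, here keeping the lanes that
// exceed a threshold and zeroing the rest.
static AkForceInline AKSIMD_V4F32 AkSimdGate_Example( AKSIMD_V4F32 vIn, AKSIMD_V4F32 vThreshold )
{
	AKSIMD_V4COND vMask = AKSIMD_GT_V4F32( vIn, vThreshold );
	return AKSIMD_VSEL_V4F32( AKSIMD_SETZERO_V4F32(), vIn, vMask );
}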
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )
#define AKSIMD_SPLAT_V4F32( var, idx ) AKSIMD_SHUFFLE_V4F32( var, var, AKSIMD_SHUFFLE(idx,idx,idx,idx) )
#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( __a__ )
static AkForceInline bool AKSIMD_TESTZERO_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
}
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)

static AkForceInline bool AKSIMD_TESTONES_V4I32( AKSIMD_V4I32 a )
{
	return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
}
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)
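// Illustrative note (not part of the original header): these tests enable
// whole-vector early-outs on a compare result, e.g.
//   if ( AKSIMD_TESTONES_V4COND( AKSIMD_GTEQ_V4F32( vIn, vZero ) ) )
//       { /* all four lanes are non-negative */ }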
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )
#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )
#define AKSIMD_INSERT_V4I32( a, i, index ) _mm_insert_epi32(a, i, index)
#define AKSIMD_INSERT_V2I64( a, i, index ) _mm_insert_epi64(a, i, index)
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))

static AkForceInline AKSIMD_V4F32 AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( AKSIMD_V4I32 vec )
{
	__m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
	__m128i expMantShifted = _mm_srli_epi32(expMantData, 3); // align the f16 exponent/mantissa to the f32 field boundaries

	// rescale the exponent by multiplying with 2^(127-15), expressed as a float bit pattern
	__m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));

	// force the exponent to all-ones where the source was inf/NaN
	__m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
	__m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
	__m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);

	// reincorporate the sign bit
	__m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
	__m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
	return assembledFloat;
}
static AkForceInline AKSIMD_V4I32 AKSIMD_CONVERT_V4F32_TO_V4F16( AKSIMD_V4F32 vec )
{
	__m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
	__m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));

	// path for values that become denormal in f16: adding 0.5f rounds them
	// into the mantissa, which is then moved into the top half
	__m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
	__m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);

	// path for values that stay normal in f16: rebias the exponent and round the mantissa
	__m128i subnormMagic = _mm_set1_epi32(0xC8000FFF);
	__m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
	__m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
	__m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31); // -1 where the kept mantissa lsb is set
	__m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb); // finish rounding to nearest even
	__m128i normResult = _mm_slli_epi32(normRoundPart2, 3);

	// select between the denormal and normal paths
	__m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23); // smallest f32 that maps to a normal f16
	__m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);
	__m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));

	// handle inf/NaN inputs
	__m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000)); // below the first value too large for f16
	__m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
	__m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000));
	__m128i nantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000)); // set a NaN mantissa bit where needed
	__m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000));
	__m128i infNanFloat = _mm_or_si128(infData, nantissaBit);
	__m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));

	// reincorporate the sign
	__m128i signedResult = _mm_or_si128(signData, resultWithInfNan);

	// pack the four 16-bit results into the low 64 bits, upper 64 bits zeroed
	__m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD);
	__m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD);
	__m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4);
	__m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8));
	return result;
}
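// Illustrative note (not part of the original header): the converter above
// leaves the four half-precision values packed in the low 64 bits of the
// result, which is the layout AKSIMD_CONVERT_V4F16_TO_V4F32_LO expects for the
// reverse conversion.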
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) _mm_castsi128_ps(__vec__)
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )
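// Illustrative sketch (not part of the original header; the function name is
// hypothetical): converting eight floats, already scaled to the 16-bit range,
// into saturated 16-bit PCM by rounding each half to 32-bit integers and
// packing with signed saturation.
static AkForceInline AKSIMD_V4I32 AkSimdFloatToPcm16_Example( AKSIMD_V4F32 vLo, AKSIMD_V4F32 vHi )
{
	return AKSIMD_PACKS_V4I32(
		AKSIMD_ROUND_V4F32_TO_V4I32( vLo ),
		AKSIMD_ROUND_V4F32_TO_V4I32( vHi ) );
}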
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	_mm_slli_epi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	_mm_srli_epi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	_mm_srai_epi32( (__vec__), (__shiftBy__) )
#if defined( AK_CPU_X86 )
typedef __m64 AKSIMD_V2F32; ///< Vector of two 32-bit values (MMX)
#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64()
#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
	_mm_slli_pi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
	_mm_srai_pi32( (__vec__), (__shiftBy__) )
#define AKSIMD_MMX_EMPTY _mm_empty()
#endif // AK_CPU_X86
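// Illustrative note (not part of the original header): MMX registers alias the
// x87 floating-point stack, so AKSIMD_MMX_EMPTY (_mm_empty) should be issued
// after MMX work completes and before any x87 floating-point code runs.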