// SSE implementation of the AKSIMD portability layer.
// NOTE(review): the guard name _AK_SIMD_SSE_H_ is a reserved identifier
// (leading underscore + uppercase); renaming would touch other files, so it
// is only flagged here, not changed.
#ifndef _AK_SIMD_SSE_H_
#define _AK_SIMD_SSE_H_
#include <xmmintrin.h> // SSE
#include <smmintrin.h> // SSE4.1 (AKSIMD_INSERT_* below)
#include <emmintrin.h> // SSE2
#if defined(__FMA__) || defined(__AVX2__)
#include <immintrin.h> // FMA (_mm_fmadd_ps / _mm_fmsub_ps)
// x86 cache/prefetch characteristics used by streaming code.
#define AKSIMD_ARCHCACHELINESIZE (64)
#define AKSIMD_ARCHMAXPREFETCHSIZE (512)
// Prefetch the cache line at __add__ + __offset__ with the non-temporal hint.
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
// Round __Size__ up to the next multiple of 16 bytes (SSE register width).
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
// 4-float load. Maps to the unaligned form on purpose, so any address is
// safe; identical to AKSIMD_LOADU_V4F32.
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_loadu_ps( (AkReal32*)(__addr__) )
// Unaligned 4-float load.
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )
// Broadcast a single float (taken by reference) to all four lanes.
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )
// Broadcast a single float (taken by value) to all four lanes.
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )
// Build from two doubles (_a = low half, _b = high half), reinterpreted as 4 floats.
#define AKSIMD_SETV_V2F64( _b, _a ) _mm_castpd_ps(_mm_set_pd( (_b), (_a) ))
// Build from four floats; _a is lane 0, _d is lane 3.
#define AKSIMD_SETV_V4F32( _d, _c, _b, _a ) _mm_set_ps( (_d), (_c), (_b), (_a) )
116 __m128i temp = _mm_set_epi32(8, 4, 2, 1);
117 __m128i xvec = _mm_set1_epi32(x);
118 __m128i xand = _mm_and_si128(xvec, temp);
119 return _mm_castsi128_ps(_mm_cmpeq_epi32(temp, xand));
// All-zero float vector.
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()
// Load one float into lane 0; lanes 1-3 are cleared to zero.
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
// 4-float stores. Both map to the unaligned form, so any address is safe.
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
// Store only lane 0 (one float).
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
// Store lanes 0-1 as a single 64-bit store.
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) _mm_store_sd( (AkReal64*)(__addr__), _mm_castps_pd(__vec__) )
// Build an immediate shuffle selector; forwards to _MM_SHUFFLE.
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )
// Shuffle: result lanes 0-1 are selected from a, lanes 2-3 from b, per
// immediate i. Arguments are parenthesized for macro hygiene (CERT PRE01-C).
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( (a), (b), (i) )
// Integer two-operand shuffle, implemented through float casts because SSE2
// has no two-operand epi32 shuffle.
#define AKSIMD_SHUFFLE_V4I32( a, b, i ) _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps((a)), _mm_castsi128_ps((b)), (i) ))
// Result = { b2, b3, a2, a3 } (high halves).
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( (a), (b) )
// Result = { a0, a1, b0, b1 } (low halves).
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( (a), (b) )
// Swap lanes within each pair: (a,b,c,d) -> (b,a,d,c).
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))
// Swap the two pairs: (a,b,c,d) -> (c,d,a,b).
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))
// Rotate left by one lane: (a,b,c,d) -> (b,c,d,a).
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))
// Duplicate the odd lanes: (a,b,c,d) -> (b,b,d,d).
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32((__vv), (__vv), AKSIMD_SHUFFLE(3,3,1,1))
// Duplicate the even lanes: (a,b,c,d) -> (a,a,c,c).
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32((__vv), (__vv), AKSIMD_SHUFFLE(2,2,0,0))
// Component-wise arithmetic on 4 packed floats. Arguments are parenthesized
// for macro hygiene (CERT PRE01-C).
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( (a), (b) )
// Lane-0-only subtract; lanes 1-3 are copied from a.
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( (a), (b) )
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( (a), (b) )
// Lane-0-only add; lanes 1-3 are copied from a.
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( (a), (b) )
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( (a), (b) )
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( (a), (b) )
// Lane-0-only multiply; lanes 1-3 are copied from a.
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( (a), (b) )
// Multiply-add / multiply-subtract: (a*b) + c and (a*b) - c.
// With FMA/AVX2 the fused hardware instruction is used (single rounding);
// the fallback uses a separate multiply and add/subtract (two roundings).
#if defined(__FMA__) || defined(__AVX2__)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_fmadd_ps( (__a__), (__b__) , (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_fmsub_ps( (__a__), (__b__) , (__c__) )
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
// Lane-0-only multiply-add: lane 0 = a0*b0 + c0, lanes 1-3 come from a.
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )
// Component-wise minimum / maximum. Arguments parenthesized for macro hygiene.
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( (a), (b) )
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( (a), (b) )
// Absolute value: clear the sign bit of every lane (andnot with -0.0f mask).
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), (a))
// Negate: flip the sign bit of every lane.
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), (__a__))
// Exact square root.
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
// Approximate reciprocal square root (hardware estimate, not exact).
#define AKSIMD_RSQRT_V4F32( __a__ ) _mm_rsqrt_ps( (__a__) )
// Approximate reciprocal (hardware estimate, not exact).
#define AKSIMD_RECIP_V4F32(__a__) _mm_rcp_ps((__a__))
// Bitwise XOR of the raw float bits.
#define AKSIMD_XOR_V4F32( a, b ) _mm_xor_ps((a),(b))
279 static const AKSIMD_V4F32 vEpsilon = { 0.49999f, 0.49999f, 0.49999f, 0.49999f };
280 return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, vEpsilon)));
289 __m128 vAb = _mm_shuffle_ps(vVec, vVec, 0xB1);
290 __m128 vHaddAb = _mm_add_ps(vVec, vAb);
291 __m128 vHaddCd = _mm_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
292 __m128 vHaddAbcd = _mm_add_ps(vHaddAb, vHaddCd);
305 static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
// SSE3 header for _mm_addsub_ps.
#include <pmmintrin.h>
// Alternating subtract/add: lanes 0,2 = a - b ; lanes 1,3 = a + b.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_addsub_ps( a, b)
// Fallback without SSE3: flip the sign of b in lanes 0 and 2, then add.
#define AKSIMD_ADDSUB_V4F32( a, b ) _mm_add_ps( a, _mm_xor_ps(b, AKSIMD_SETV_V4F32(0.f, -0.f, 0.f, -0.f)))
// Debug assertion that flush-to-zero is enabled in MXCSR.
// VS2010 and earlier used a one-argument _MM_GET_FLUSH_ZERO_MODE.
#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#elif defined(AK_CPU_X86) || defined(AK_CPU_X86_64)
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
// No-op on targets without MXCSR.
#define AKSIMD_ASSERTFLUSHZEROMODE
// Packed 32-bit integer arithmetic and comparisons. Arguments parenthesized
// for macro hygiene (CERT PRE01-C).
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( (a), (b) )
#define AKSIMD_CMPLT_V4I32( a, b ) _mm_cmplt_epi32((a),(b))
#define AKSIMD_CMPGT_V4I32( a, b ) _mm_cmpgt_epi32((a),(b))
#define AKSIMD_OR_V4I32( a, b ) _mm_or_si128((a),(b))
#define AKSIMD_XOR_V4I32( a, b ) _mm_xor_si128((a),(b))
#define AKSIMD_SUB_V4I32( a, b ) _mm_sub_epi32((a),(b))
// Bitwise NOT via XOR with all-ones.
#define AKSIMD_NOT_V4I32( a ) _mm_xor_si128((a),_mm_set1_epi32(~0))
// Bitwise ops on the raw bits of float vectors.
#define AKSIMD_OR_V4F32( a, b ) _mm_or_ps((a),(b))
#define AKSIMD_AND_V4F32( a, b ) _mm_and_ps((a),(b))
// Note the argument order: returns (~a) & b, matching _mm_andnot_ps.
#define AKSIMD_ANDNOT_V4F32( a, b ) _mm_andnot_ps((a),(b))
#define AKSIMD_NOT_V4F32( a ) _mm_xor_ps((a),_mm_castsi128_ps(_mm_set1_epi32(~0)))
// Logical combination of comparison masks.
#define AKSIMD_OR_V4COND( a, b ) _mm_or_ps((a),(b))
#define AKSIMD_AND_V4COND( a, b ) _mm_and_ps((a),(b))
// Lane-wise 16-bit multiply, low 16 bits of each product kept.
#define AKSIMD_MULLO16_V4I32( a , b) _mm_mullo_epi16((a), (b))
387 #ifdef __SSE4_1__ // use SSE 4.1 version directly where possible
388 return _mm_mullo_epi32(vIn1, vIn2);
389 #else // use SSE 2 otherwise
390 __m128i tmp1 = _mm_mul_epu32(vIn1, vIn2);
391 __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(vIn1, 4), _mm_srli_si128(vIn2, 4));
392 return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
// Interleave the low halves of a and b: result = { a0, b0, a1, b1 }.
// Arguments parenthesized for macro hygiene.
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( (a), (b) )
// Interleave the high halves of a and b: result = { a2, b2, a3, b3 }.
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( (a), (b) )
424 _mm_set1_epi32(*(
AkInt32*)addr0),
425 _mm_set1_epi32(*(
AkInt32*)addr1),
426 _mm_set1_epi32(*(
AkInt32*)addr2),
427 _mm_set1_epi32(*(
AkInt32*)addr3),
431 _mm_unpacklo_epi32(data[0], data[1]),
432 _mm_unpacklo_epi32(data[2], data[3]),
435 __m128i shuffle = _mm_unpacklo_epi64(group[0], group[1]);
438 _mm_srai_epi32(_mm_slli_epi32(shuffle, 16), 16),
439 _mm_srai_epi32(shuffle, 16)
459 _mm_set1_epi64x(*(
AkInt64*)addr0),
460 _mm_set1_epi64x(*(
AkInt64*)addr1),
461 _mm_set1_epi64x(*(
AkInt64*)addr2),
462 _mm_set1_epi64x(*(
AkInt64*)addr3),
466 _mm_unpacklo_epi64(data[0], data[1]),
467 _mm_unpacklo_epi64(data[2], data[3]),
470 __m128i shuffle[2] = {
471 _mm_castps_si128 (_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0x88)),
472 _mm_castps_si128 (_mm_shuffle_ps(_mm_castsi128_ps(group[0]), _mm_castsi128_ps(group[1]), 0xDD)),
476 _mm_srai_epi32(_mm_slli_epi32(shuffle[0],16),16),
477 _mm_srai_epi32(shuffle[0],16),
478 _mm_srai_epi32(_mm_slli_epi32(shuffle[1],16),16),
479 _mm_srai_epi32(shuffle[1],16),
// Comparison results are per-lane masks: all-ones = true, all-zeroes = false.
#define AKSIMD_CMP_CTRLMASK __m128
// a <= b, per lane.
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )
// a < b, per lane.
#define AKSIMD_LT_V4F32( __a__, __b__ ) _mm_cmplt_ps( (__a__), (__b__) )
// a >= b, per lane.
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )
// a > b, per lane.
#define AKSIMD_GT_V4F32( __a__, __b__ ) _mm_cmpgt_ps( (__a__), (__b__) )
// a == b, per lane (exact float equality).
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
510 vB = _mm_and_ps( vB, vMask );
511 vA= _mm_andnot_ps( vMask, vA );
512 return _mm_or_ps( vA, vB );
// Per-lane select: (__cond1__ >= __cond2__) ? __b__ : __a__.
// Arguments parenthesized for macro hygiene (CERT PRE01-C).
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( (__a__), (__b__), AKSIMD_GTEQ_V4F32( (__cond1__), (__cond2__) ) )
// Per-lane select: (__a__ >= 0) ? __b__ : __c__.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( (__a__), _mm_set1_ps(0) ) )
// Broadcast lane idx (compile-time constant 0-3) to all four lanes.
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32((var), (var), AKSIMD_SHUFFLE((idx),(idx),(idx),(idx)))
// Pack the sign bit of each lane into the low 4 bits of an int.
#define AKSIMD_MASK_V4F32( __a__ ) _mm_movemask_ps( (__a__) )
528 return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_setzero_si128())) == 0xFFFF;
// Non-zero iff all 128 bits of __a__ are zero (forwards to the integer test).
#define AKSIMD_TESTZERO_V4F32( __a__ ) AKSIMD_TESTZERO_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__ ) AKSIMD_TESTZERO_V4F32(__a__)
536 return _mm_movemask_epi8(_mm_cmpeq_epi32(a, _mm_set1_epi32(~0))) == 0xFFFF;
// Non-zero iff all 128 bits of __a__ are set (forwards to the integer test).
#define AKSIMD_TESTONES_V4F32( __a__ ) AKSIMD_TESTONES_V4I32(_mm_castps_si128(__a__))
#define AKSIMD_TESTONES_V4COND( __a__ ) AKSIMD_TESTONES_V4F32(__a__)
// Unaligned 4 x int32 load. AKSIMD_LOAD_V4I32 also maps to the unaligned
// form, so any address is safe for both.
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )
// All-zero integer vector.
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()
// Broadcast one 32-bit integer to all four lanes.
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )
// Build from four 32-bit lanes; _a is lane 0, _d is lane 3.
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )
// Build from two 64-bit lanes; _a is lane 0, _b is lane 1.
#define AKSIMD_SETV_V2I64( _b, _a ) _mm_set_epi64x( (_b), (_a) )
// Insert i at lane index (compile-time constant). Requires SSE4.1.
// Arguments parenthesized for macro hygiene.
#define AKSIMD_INSERT_V4I32( a, i, index) _mm_insert_epi32((a), (i), (index))
#define AKSIMD_INSERT_V2I64( a, i, index) _mm_insert_epi64((a), (i), (index))
// Unaligned 4 x int32 stores (both forms are unaligned-safe).
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__m128i*)(__addr__), (__vec__) )
// int32 -> float conversion.
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )
// float -> int32 using the current MXCSR rounding mode.
#define AKSIMD_ROUND_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )
// float -> int32 with truncation toward zero.
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )
// Bitwise AND of integer vectors.
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )
// a > b per signed 16-bit lane (8 lanes).
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )
// Convert the 4 low half-floats of __vec__ to 4 floats. The unpack places
// each 16-bit value in the high half of a 32-bit lane before the helper runs.
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpacklo_epi16(_mm_setzero_si128(), __vec__))
// Same conversion for the 4 high half-floats.
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( _mm_unpackhi_epi16(_mm_setzero_si128(), __vec__))
607 __m128i expMantData = _mm_and_si128(vec, _mm_set1_epi32(0x7fff0000));
608 __m128i expMantShifted = _mm_srli_epi32(expMantData, 3);
611 __m128i expMantFloat = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(expMantShifted), _mm_castsi128_ps(_mm_set1_epi32(0x77800000))));
614 __m128i infnanCheck = _mm_cmpgt_epi32(expMantData, _mm_set1_epi32(0x7bffffff));
615 __m128i infnanExp = _mm_and_si128(infnanCheck, _mm_set1_epi32(255 << 23));
616 __m128i expMantWithInfNan = _mm_or_si128(expMantFloat, infnanExp);
619 __m128i signData = _mm_and_si128(vec, _mm_set1_epi32(0x80000000));
620 __m128 assembledFloat = _mm_castsi128_ps(_mm_or_si128(signData, expMantWithInfNan));
621 return assembledFloat;
628 __m128i signData = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x80000000));
629 __m128i unsignedVec = _mm_and_si128(_mm_castps_si128(vec), _mm_set1_epi32(0x7fffffff));
633 __m128 denormedVec = _mm_add_ps(_mm_castsi128_ps(unsignedVec), _mm_set1_ps(0.5f));
634 __m128i denormResult = _mm_slli_epi32(_mm_castps_si128(denormedVec), 16);
637 __m128i subnormMagic = _mm_set1_epi32(0xC8000FFF);
638 __m128i normRoundPart1 = _mm_add_epi32(unsignedVec, subnormMagic);
639 __m128i mantLsb = _mm_slli_epi32(unsignedVec, 31 - 13);
640 __m128i mantSignExtendLsb = _mm_srai_epi32(mantLsb, 31);
641 __m128i normRoundPart2 = _mm_sub_epi32(normRoundPart1, mantSignExtendLsb);
642 __m128i normResult = _mm_slli_epi32(normRoundPart2, 3);
645 __m128i normalMinimum = _mm_set1_epi32((127 - 14) << 23);
646 __m128i denormMask = _mm_cmpgt_epi32(normalMinimum, unsignedVec);
648 __m128i nonNanFloat = _mm_or_si128(_mm_and_si128(denormMask, denormResult), _mm_andnot_si128(denormMask, normResult));
651 __m128i isNotInfNanMask = _mm_cmplt_epi32(unsignedVec, _mm_set1_epi32(0x47800000));
652 __m128i mantissaData = _mm_and_si128(unsignedVec, _mm_set1_epi32(0x007fffff));
653 __m128i isNanMask = _mm_cmpgt_epi32(unsignedVec, _mm_set1_epi32(0x7F800000));
654 __m128i nantissaBit = _mm_and_si128(isNanMask, _mm_set1_epi32(0x02000000));
655 __m128i infData = _mm_andnot_si128(mantissaData, _mm_set1_epi32(0x7c000000));
656 __m128i infNanFloat = _mm_or_si128(infData, nantissaBit);
658 __m128i resultWithInfNan = _mm_or_si128(_mm_and_si128(isNotInfNanMask, nonNanFloat), _mm_andnot_si128(isNotInfNanMask, infNanFloat));
661 __m128i signedResult = _mm_or_si128(signData, resultWithInfNan);
664 __m128i resultEpi16Lo = _mm_shufflelo_epi16(signedResult, 0xD);
665 __m128i resultEpi16Hi = _mm_shufflehi_epi16(signedResult, 0xD);
666 __m128 resultEpi16 = _mm_shuffle_ps(_mm_castsi128_ps(resultEpi16Lo), _mm_castsi128_ps(resultEpi16Hi), 0xE4);
667 __m128i result = _mm_castps_si128(_mm_shuffle_ps(resultEpi16, _mm_setzero_ps(), 0x8));
// Bit-preserving reinterpret casts between vector types (no value conversion,
// zero instructions generated).
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) _mm_castpd_ps(__vec__)
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) _mm_castpd_si128(__vec__)
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) _mm_castps_pd(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) _mm_castsi128_pd(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) _mm_castsi128_ps(__vec__)
// The condition-mask type is __m128 (see AKSIMD_CMP_CTRLMASK), so the
// float <-> cond casts are identity.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) (__vec__)
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) _mm_castps_si128(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) _mm_castsi128_ps(__vec__)
// Interleave the low / high 16-bit halves of a and b.
// Arguments parenthesized for macro hygiene (CERT PRE01-C).
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( (a), (b) )
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( (a), (b) )
// Pack 8 x int32 (a then b) into 8 x int16 with signed saturation.
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( (a), (b) )
// Per-lane 32-bit logical left shift; __shiftBy__ must be in [0, 31].
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	_mm_slli_epi32( (__vec__), (__shiftBy__) )
// Per-lane 32-bit logical (zero-filling) right shift.
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	_mm_srli_epi32( (__vec__), (__shiftBy__) )
// Per-lane 32-bit arithmetic (sign-extending) right shift.
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	_mm_srai_epi32( (__vec__), (__shiftBy__) )
// Legacy MMX helpers, only compiled for 32-bit x86 builds.
#if defined( AK_CPU_X86 )
// NOTE(review): the trailing ';' in this macro prevents its use inside an
// expression; kept as-is because existing callers may rely on the current form.
#define AKSIMD_SETZERO_V2F32() _mm_setzero_si64();
// NOTE(review): name says V2I32 but this maps to a 16-bit-lane compare
// (_mm_cmpgt_pi16) — confirm intent against callers.
#define AKSIMD_CMPGT_V2I32( a, b ) _mm_cmpgt_pi16(a,b)
// Interleave the low / high 16-bit halves of two 64-bit MMX vectors.
#define AKSIMD_UNPACKLO_VECTOR4I16( a, b ) _mm_unpacklo_pi16( a, b )
#define AKSIMD_UNPACKHI_VECTOR4I16( a, b ) _mm_unpackhi_pi16( a, b )
// Per-lane 32-bit shifts on 64-bit MMX vectors.
#define AKSIMD_SHIFTLEFT_V2I32( __vec__, __shiftBy__ ) \
	_mm_slli_pi32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHTARITH_V2I32( __vec__, __shiftBy__ ) \
	_mm_srai_pi32( (__vec__), (__shiftBy__) )
// Clears MMX state so subsequent x87 floating-point code works.
// NOTE(review): trailing ';' — same caveat as AKSIMD_SETZERO_V2F32 above.
#define AKSIMD_MMX_EMPTY _mm_empty();
785 #endif //_AK_SIMD_SSE_H_