00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00032
00033 #ifndef _AK_SIMD_SSE_H_
00034 #define _AK_SIMD_SSE_H_
00035
00036 #include <AK/SoundEngine/Common/AkTypes.h>
00037 #include <xmmintrin.h>
00038
00041
00042
#define AKSIMD_ARCHCACHELINESIZE (64) ///< Assumed cache line width for architectures on this platform
#define AKSIMD_ARCHMAXPREFETCHSIZE (512) ///< Use this to control how much prefetching maximum is desirable (assuming 8-way cache)

/// Prefetch the cache line containing ((char*)__add__ + __offset__) with the
/// non-temporal hint (_MM_HINT_NTA), i.e. fetch for one-time streaming use
/// while minimizing pollution of the cache hierarchy.
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) _mm_prefetch(((char *)(__add__))+(__offset__), _MM_HINT_NTA )
00047
00049
00050
00053
/// Rounds __Size__ up to the next multiple of 16 bytes (the alignment required
/// by the aligned SSE load/store intrinsics used in this header).
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
00055
00056
00057
00060
00061
typedef float AKSIMD_F32; ///< 32-bit float scalar
typedef __m128 AKSIMD_V4F32; ///< Vector of 4 32-bit floats (one SSE register)
typedef AKSIMD_V4F32 AKSIMD_V4COND; ///< Comparison-result mask vector (per-lane all-ones/all-zeros)
typedef AKSIMD_V4F32 AKSIMD_V4FCOND; ///< Float comparison-result mask vector
00066
00068
00069
00070
00073
00074
/// Loads four floats from __addr__ (must be 16-byte aligned)
#define AKSIMD_LOAD_V4F32( __addr__ ) _mm_load_ps( (AkReal32*)(__addr__) )

/// Loads four floats from __addr__ (no alignment requirement)
#define AKSIMD_LOADU_V4F32( __addr__ ) _mm_loadu_ps( (__addr__) )

/// Loads a single float (passed by reference) and broadcasts it to all four lanes
#define AKSIMD_LOAD1_V4F32( __scalar__ ) _mm_load1_ps( &(__scalar__) )

/// Sets all four lanes to the value of __scalar__
#define AKSIMD_SET_V4F32( __scalar__ ) _mm_set_ps1( (__scalar__) )

/// Returns a vector with all four lanes set to 0.0f
#define AKSIMD_SETZERO_V4F32() _mm_setzero_ps()

/// Loads one float from __addr__ into lane 0; lanes 1-3 are cleared to 0.0f
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) _mm_load_ss( (__addr__) )
00098
00100
00101
00102
00105
00106
/// Stores four floats to __addr__ (must be 16-byte aligned)
#define AKSIMD_STORE_V4F32( __addr__, __vec__ ) _mm_store_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores four floats to __addr__ (no alignment requirement)
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) _mm_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores only lane 0 of __vec__ to __addr__
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), (__vec__) )
00118
00120
00121
00124
00125
00126
/// Builds a shuffle immediate; fpN selects the source lane for result lane N
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) _MM_SHUFFLE( (fp3), (fp2), (fp1), (fp0) )

/// Result lanes 0-1 pick from a, lanes 2-3 pick from b, per immediate i
#define AKSIMD_SHUFFLE_V4F32( a, b, i ) _mm_shuffle_ps( a, b, i )

/// Result = (b2, b3, a2, a3): moves the high half of b into the low half, high half of a on top
#define AKSIMD_MOVEHL_V4F32( a, b ) _mm_movehl_ps( a, b )

/// Result = (a0, a1, b0, b1): low half of a, then low half of b
#define AKSIMD_MOVELH_V4F32( a, b ) _mm_movelh_ps( a, b )

/// Swaps adjacent lane pairs: (A,B,C,D) -> (B,A,D,C)
#define AKSIMD_SHUFFLE_BADC( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(2,3,0,1))

/// Swaps the two halves: (A,B,C,D) -> (C,D,A,B)
#define AKSIMD_SHUFFLE_CDAB( __a__ ) _mm_shuffle_ps( (__a__), (__a__), _MM_SHUFFLE(1,0,3,2))

/// Rotates lanes down by one: (A,B,C,D) -> (B,C,D,A)
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), _MM_SHUFFLE(0,3,2,1))

/// Duplicates the odd-indexed lanes: (A,B,C,D) -> (B,B,D,D)
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even-indexed lanes: (A,B,C,D) -> (A,A,C,C)
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
00162
00164
00165
00166
00169
00170
/// Per-lane a - b
#define AKSIMD_SUB_V4F32( a, b ) _mm_sub_ps( a, b )

/// Lane 0 = a0 - b0; lanes 1-3 are passed through from a
#define AKSIMD_SUB_SS_V4F32( a, b ) _mm_sub_ss( a, b )

/// Per-lane a + b
#define AKSIMD_ADD_V4F32( a, b ) _mm_add_ps( a, b )

/// Lane 0 = a0 + b0; lanes 1-3 are passed through from a
#define AKSIMD_ADD_SS_V4F32( a, b ) _mm_add_ss( a, b )

/// Per-lane a * b
#define AKSIMD_MUL_V4F32( a, b ) _mm_mul_ps( a, b )

/// Per-lane a / b
#define AKSIMD_DIV_V4F32( a, b ) _mm_div_ps( a, b )

/// Lane 0 = a0 * b0; lanes 1-3 are passed through from a
#define AKSIMD_MUL_SS_V4F32( a, b ) _mm_mul_ss( a, b )

/// Per-lane (a * b) + c / (a * b) - c.
/// NOTE: function-like macros — arguments may be evaluated more than once.
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) _mm_add_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) _mm_sub_ps( _mm_mul_ps( (__a__), (__b__) ), (__c__) )

/// Lane 0 = (a0 * b0) + c0; upper lanes follow the _ss pass-through rules above
#define AKSIMD_MADD_SS_V4F32( __a__, __b__, __c__ ) _mm_add_ss( _mm_mul_ss( (__a__), (__b__) ), (__c__) )

/// Per-lane minimum of a and b
#define AKSIMD_MIN_V4F32( a, b ) _mm_min_ps( a, b )

/// Per-lane maximum of a and b
#define AKSIMD_MAX_V4F32( a, b ) _mm_max_ps( a, b )

/// Per-lane absolute value: clears the sign bit by and-not with -0.0f
#define AKSIMD_ABS_V4F32( a ) _mm_andnot_ps(_mm_set1_ps(-0.f), a)

/// Per-lane negation: flips the sign bit by xor with -0.0f
#define AKSIMD_NEG_V4F32( __a__ ) _mm_xor_ps(_mm_set1_ps(-0.f), __a__)

/// Per-lane square root
#define AKSIMD_SQRT_V4F32( __a__ ) _mm_sqrt_ps( (__a__) )
00224
00229 static AkForceInline void AKSIMD_HORIZONTALADD(AKSIMD_V4F32 & vVec)
00230 {
00231 __m128 vHighLow = _mm_movehl_ps(vVec, vVec);
00232 vVec = _mm_add_ps(vVec, vHighLow);
00233 vHighLow = _mm_shuffle_ps(vVec, vVec, 0x55);
00234 vVec = _mm_add_ps(vVec, vHighLow);
00235 }
00236
/// Four-element dot product of vVec with vfSigns: multiplies lane-wise, sums
/// the four products, and returns the sum broadcast to all four lanes.
/// NOTE(review): vVec is taken by non-const reference but is not modified here;
/// presumably historical — confirm before tightening to const&.
static AkForceInline AKSIMD_V4F32 AKSIMD_DOTPRODUCT( AKSIMD_V4F32 & vVec, const AKSIMD_V4F32 & vfSigns )
{
	AKSIMD_V4F32 vfDotProduct = AKSIMD_MUL_V4F32( vVec, vfSigns );
	AKSIMD_HORIZONTALADD( vfDotProduct );
	// After HORIZONTALADD only lane 0 holds the full sum; splat it everywhere.
	return AKSIMD_SHUFFLE_V4F32( vfDotProduct, vfDotProduct, AKSIMD_SHUFFLE(0,0,0,0) );
}
00243
/// Multiplies two pairs of interleaved complex numbers.
/// Each vector holds two complex values laid out as (re0, im0, re1, im1);
/// the result has the same layout, with out_k = vCIn1_k * vCIn2_k.
static AkForceInline AKSIMD_V4F32 AKSIMD_COMPLEXMUL( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
{
	// Alternating signs applied to the imaginary-part products
	static const AKSIMD_V4F32 vSign = { -1.f, 1.f, -1.f, 1.f };

	// vTmp1 = (re0, re0, re1, re1) of vCIn1, times vCIn2 -> (re*re, re*im, ...)
	AKSIMD_V4F32 vTmp1 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(2,2,0,0));
	vTmp1 = AKSIMD_MUL_V4F32( vTmp1, vCIn2 );
	// vTmp2 = (im0, im0, im1, im1) of vCIn1, sign-flipped in the even lanes
	AKSIMD_V4F32 vTmp2 = _mm_shuffle_ps( vCIn1, vCIn1, _MM_SHUFFLE(3,3,1,1));
	vTmp2 = AKSIMD_MUL_V4F32( vTmp2, vSign );
	// Multiply by vCIn2 with re/im swapped (BADC) and accumulate vTmp1, giving
	// (re1*re2 - im1*im2, re1*im2 + im1*re2) per complex pair
	vTmp2 = AKSIMD_MADD_V4F32( vTmp2, AKSIMD_SHUFFLE_BADC( vCIn2 ), vTmp1 );
	return vTmp2;
}
00256
00257 #ifdef AK_SSE3
00258
00259 #include <pmmintrin.h>
00260
00262 static AKSIMD_V4F32 AKSIMD_COMPLEXMUL_SSE3( const AKSIMD_V4F32 vCIn1, const AKSIMD_V4F32 vCIn2 )
00263 {
00264 AKSIMD_V4F32 vXMM0 = _mm_moveldup_ps(vCIn1);
00265 vXMM0 = AKSIMD_MUL_V4F32(vXMM0, vCIn2);
00266 AKSIMD_V4F32 xMM1 = _mm_shuffle_ps(vCIn2, vCIn2, 0xB1);
00267 AKSIMD_V4F32 xMM2 = _mm_movehdup_ps(vCIn1);
00268 xMM2 = AKSIMD_MUL_V4F32( xMM2, xMM1);
00269 AKSIMD_V4F32 vCOut = _mm_addsub_ps(vXMM0, xMM2);
00270 return vCOut;
00271 }
00272
00273 #endif
00274
/// Asserts that the flush-to-zero mode is enabled, so denormal results do not
/// incur heavy performance penalties in DSP loops.
/// NOTE(review): the MSVC <= 1600 (VS2010) branch passes a `dummy` token —
/// presumably that toolchain's _MM_GET_FLUSH_ZERO_MODE macro took an argument;
/// confirm against the corresponding SDK headers before touching this.
#if defined _MSC_VER && ( _MSC_VER <= 1600 )
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE(dummy) == _MM_FLUSH_ZERO_ON )
#else
#define AKSIMD_ASSERTFLUSHZEROMODE AKASSERT( _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON )
#endif
00280
00282
00283
00284
00287
00288
/// Per-lane 32-bit integer addition
#define AKSIMD_ADD_V4I32( a, b ) _mm_add_epi32( a, b )

/// Interleaves the low halves of a and b: (a0, b0, a1, b1)
#define AKSIMD_UNPACKLO_V4F32( a, b ) _mm_unpacklo_ps( a, b )

/// Interleaves the high halves of a and b: (a2, b2, a3, b3)
#define AKSIMD_UNPACKHI_V4F32( a, b ) _mm_unpackhi_ps( a, b )
00307
00309
00310
00314
00315
/// Mask type produced by the vector comparisons below
#define AKSIMD_CMP_CTRLMASK __m128

/// Per-lane a <= b; each result lane is all-ones where true, all-zeros where false
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) _mm_cmple_ps( (__a__), (__b__) )

/// Per-lane a >= b; each result lane is all-ones where true, all-zeros where false
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) _mm_cmpge_ps( (__a__), (__b__) )

/// Per-lane a == b; each result lane is all-ones where true, all-zeros where false
#define AKSIMD_EQ_V4F32( __a__, __b__ ) _mm_cmpeq_ps( (__a__), (__b__) )
00326
00328 static AkForceInline AKSIMD_V4F32 AKSIMD_VSEL_V4F32( AKSIMD_V4F32 vA, AKSIMD_V4F32 vB, AKSIMD_V4F32 vMask )
00329 {
00330 vB = _mm_and_ps( vB, vMask );
00331 vA= _mm_andnot_ps( vMask, vA );
00332 return _mm_or_ps( vA, vB );
00333 }
00334
00335
/// Per-lane: where (__cond1__ >= __cond2__) take __b__, otherwise take __a__
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )

/// Per-lane: where (__a__ >= 0) take __b__, otherwise take __c__
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, _mm_set1_ps(0) ) )

/// Broadcasts lane idx of var to all four lanes (idx must be a compile-time 0..3)
#define AKSIMD_SPLAT_V4F32(var, idx) AKSIMD_SHUFFLE_V4F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
00342
00344
00345
00346 #include <emmintrin.h>
00347
typedef __m128i AKSIMD_V4I32; ///< Vector of 4 32-bit signed integers

typedef AKSIMD_V4I32 AKSIMD_V4ICOND; ///< Integer comparison-result mask vector

/// Loads four 32-bit integers from __addr__ (no alignment requirement)
#define AKSIMD_LOADU_V4I32( __addr__ ) _mm_loadu_si128( (__addr__) )

/// Loads four 32-bit integers from __addr__ (must be 16-byte aligned)
#define AKSIMD_LOAD_V4I32( __addr__ ) _mm_load_si128( (__addr__) )

/// Returns an integer vector with all lanes set to 0
#define AKSIMD_SETZERO_V4I32() _mm_setzero_si128()

/// Sets all four integer lanes to __scalar__
#define AKSIMD_SET_V4I32( __scalar__ ) _mm_set1_epi32( (__scalar__) )

/// Builds a vector from four scalars; _a goes to lane 0 (lowest), _d to lane 3
#define AKSIMD_SETV_V4I32( _d, _c, _b, _a ) _mm_set_epi32( (_d), (_c), (_b), (_a) )

/// Stores four 32-bit integers to __addr__ (must be 16-byte aligned)
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) _mm_store_si128( (__addr__), (__vec__) )

/// Stores four 32-bit integers to __addr__ (no alignment requirement)
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) _mm_storeu_si128( (__addr__), (__vec__) )
00371
00374
00375
/// Converts four 32-bit integers to four floats
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) _mm_cvtepi32_ps( (__vec__) )

/// Converts four floats to integers, rounding per the current MXCSR rounding
/// mode (round-to-nearest-even by default)
#define AKSIMD_CONVERT_V4F32_TO_V4I32( __vec__ ) _mm_cvtps_epi32( (__vec__) )

/// Converts four floats to integers, truncating toward zero
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) _mm_cvttps_epi32( (__vec__) )

/// Bitwise AND of two integer vectors
#define AKSIMD_AND_V4I32( __a__, __b__ ) _mm_and_si128( (__a__), (__b__) )

/// Per-lane signed 16-bit a > b; result lanes are all-ones where true
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) _mm_cmpgt_epi16( (__a__), (__b__) )

/// Interleaves the four low 16-bit lanes of a and b
#define AKSIMD_UNPACKLO_VECTOR8I16( a, b ) _mm_unpacklo_epi16( a, b )

/// Interleaves the four high 16-bit lanes of a and b
#define AKSIMD_UNPACKHI_VECTOR8I16( a, b ) _mm_unpackhi_epi16( a, b )

/// Packs the 32-bit lanes of a (low half) and b (high half) into eight
/// 16-bit lanes with signed saturation
#define AKSIMD_PACKS_V4I32( a, b ) _mm_packs_epi32( a, b )

/// Shifts each 32-bit lane left by __shiftBy__, inserting zeros
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	_mm_slli_epi32( (__vec__), (__shiftBy__) )

/// Arithmetic right shift of each 32-bit lane by __shiftBy__ (sign-extending)
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	_mm_srai_epi32( (__vec__), (__shiftBy__) )
00424
00426
00427
#if defined( AK_CPU_X86 ) && !defined(AK_IOS) /// MMX

typedef __m64 AKSIMD_V2F32; ///< Vector of 2 32-bit floats (MMX register; 32-bit x86 only, excluded on iOS)

#endif
00433
00434
00435 #endif //_AK_SIMD_SSE_H_