Wwise SDK 2019.2.15
_ak_simd_avx_8h_source
版本
menu_open
Wwise SDK 2019.2.15
|
AkSimdAvx.h
浏览该文件的文档.
74 #define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
86 /// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
87 /// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
88 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them
104 #define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )
108 #define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )
124 /// For each 128b lane, Swap the 2 lower floats together and the 2 higher floats together. ( h g f e d c b a -> g h e f c d a b )
125 #define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
127 /// For each 128b lane, Swap the 2 lower floats with the 2 higher floats. ( h g f e d c b a -> f e h g b a d c )
128 #define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
131 #define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
133 /// For each 128b lane, duplicates the odd items into the even items ( h g f e d c b a -> h h f f d d b b )
136 /// For each 128b lane, duplicates the even items into the odd items ( h g f e d c b a -> g g e e c c a a )
146 /// Selects the lower of each of the 128b lanes in a and b to be the result ( B A ), ( D C ) -> ( C A )
147 #define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))
149 /// Selects the higher of each of the 128b lanes in a and b to be the result ( B A ), ( D C) -> ( D B )
150 #define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))
168 #define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps(b, _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) )
177 #define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps(b, _mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) )
189 #define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) )
192 #define AKSIMD_MADD_V8F32( __a__, __b__, __c__ ) _mm256_add_ps( _mm256_mul_ps( (__a__), (__b__) ), (__c__) )
193 #define AKSIMD_MSUB_V8F32( __a__, __b__, __c__ ) _mm256_sub_ps( _mm256_mul_ps( (__a__), (__b__) ), (__c__) )
196 #define AKSIMD_MADD_SS_V8F32( __a__, __b__, __c__ ) AKSIMD_ADD_SS_V8F32( AKSIMD_MUL_SS_V8F32( (__a__), (__b__) ), (__c__) )
229 /// horizontal add across the entire vector - vVec will be updated to contain the sum of every input element of vVec
242 /// Cross-platform SIMD multiplication of 8 complex data elements with interleaved real and imaginary parts
243 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(const AKSIMD_V8F32 cIn1, const AKSIMD_V8F32 cIn2)
246 __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1); // shuf multiplicand (c3, d3, c2, d2, c1, d1, c0, d0)
248 __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf); // temp (b3c3, b3d3, b2c2, b2d2, b1c1, b1d1, b0c0, b0d0)
250 __m256 out = _mm256_addsub_ps(mul, temp); // final (a3d3+b3c3, a3c3-b3d3, a2d2+b2c2, a2c2-b2d2, a1d1+b1c1, a1c1-b1d1, a0d0+b0c0, a0c0-b0d0)
295 /// Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usually provided by above comparison operations
296 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
302 #define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )
305 #define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )
307 #define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))
337 #define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )
340 /// Note that this should be utilized instead of, e.g. adding & utilizing a macro "AKSIMD_INSERT_V8I32(m, i, idx)"
341 /// Because there is no direct corresponding instruction for an insert into 256. You should load into 128s
342 /// and use that. Some compilers do not handle _mm256_insert_epi32 (etc) well, or even include them