#ifndef _AK_SIMD_AVX_H_
#define _AK_SIMD_AVX_H_

#if defined(AKSIMD_AVX_SUPPORTED)

#include <immintrin.h>

typedef __m256	AKSIMD_V8F32;	///< Vector of 8 32-bit floats
typedef __m256d	AKSIMD_V4F64;	///< Vector of 4 64-bit floats
typedef __m256i	AKSIMD_V8I32;	///< Vector of 8 32-bit signed integers
typedef AKSIMD_V8F32 AKSIMD_V8COND;
typedef AKSIMD_V8F32 AKSIMD_V8FCOND;
typedef AKSIMD_V8I32 AKSIMD_V8ICOND;
/// Loads eight single-precision floats from unaligned memory (see _mm256_loadu_ps)
#define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) )

/// Loads a single single-precision float and broadcasts it to all eight lanes
#define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) )

/// Loads a single double-precision float, broadcasts it to all four lanes, and reinterprets the result as a V8F32
#define AKSIMD_LOAD1_V4F64( __scalar__ ) _mm256_castpd_ps(_mm256_broadcast_sd( &(__scalar__) ))

/// Sets all eight lanes to the given scalar value
#define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) )

/// Populates the eight lanes; _a goes to the lowest lane, _h to the highest
#define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

/// Sets all eight lanes to zero
#define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps()

/// Loads a single float into the lowest lane and zeroes the other seven
#define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256(_mm_load_ss( (__addr__) ))

/// Builds a V8F32 from two __m128 halves; m1 goes to the low 128-bit lane, m2 to the high one
#define AKSIMD_SET_V2F128( m1, m2 ) _mm256_setr_m128(m1, m2)

/// Replaces the 128-bit lane selected by idx (0 = low, 1 = high) with m128
#define AKSIMD_INSERT_V2F128( a, m128, idx ) _mm256_insertf128_ps(a, m128, idx)
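// Usage sketch (illustrative only; vLow and vHigh are hypothetical __m128 values):
//   AKSIMD_V8F32 v = AKSIMD_SET_V2F128( vLow, vHigh );   // v = [ vLow | vHigh ]
//   v = AKSIMD_INSERT_V2F128( v, vLow, 1 );              // v = [ vLow | vLow ]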
/// Stores eight single-precision floats to unaligned memory (see _mm256_storeu_ps)
#define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

/// Stores the lowest lane of the vector to memory as a single float
#define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )
/// Shuffles within each 128-bit lane: the two low outputs of a lane come from a,
/// the two high outputs from b, selected by the AKSIMD_SHUFFLE immediate (see _mm256_shuffle_ps)
#define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i )

/// Swaps the members of each pair in both 128-bit lanes: [a,b,c,d] -> [b,a,d,c]
#define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))

/// Swaps the low pair with the high pair in both 128-bit lanes: [a,b,c,d] -> [c,d,a,b]
#define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))

/// Rotates each 128-bit lane down by one element: [a,b,c,d] -> [b,c,d,a]
#define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

/// Duplicates the odd-indexed elements of each 128-bit lane: [a,b,c,d] -> [b,b,d,d]
#define AKSIMD_DUP_V8_ODD( __vv ) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))

/// Duplicates the even-indexed elements of each 128-bit lane: [a,b,c,d] -> [a,a,c,c]
#define AKSIMD_DUP_V8_EVEN( __vv ) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
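// Per-lane shuffle example (illustrative; v is a hypothetical vector [0,1,2,3|4,5,6,7]):
//   AKSIMD_SHUFFLE_V8_BADC( v )  -> [1,0,3,2|5,4,7,6]
//   AKSIMD_DUP_V8_EVEN( v )      -> [0,0,2,2|4,4,6,6]
// All AKSIMD_SHUFFLE_V8F32-based macros operate on each 128-bit lane independently;
// they never move data across the two lanes.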
/// Integer variant of AKSIMD_SHUFFLE_V8F32, implemented through float shuffles
#define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))

/// Permutes within each 128-bit lane of a, using the low two bits of each 32-bit element of b as indices
#define AKSIMD_PERMUTEVAR_V8F32( a, b ) _mm256_permutevar_ps(a, b)

/// Builds the immediate for AKSIMD_PERMUTE_2X128_V8F32: l0 selects the low output lane,
/// l1 the high one (0/1 = low/high lane of a, 2/3 = low/high lane of b)
#define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0))

/// Selects and arranges 128-bit lanes from a and b according to the immediate i
#define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps(a, b, i)

/// Gathers the low 128-bit lanes of a and b: [a.lo | b.lo]
#define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))

/// Gathers the high 128-bit lanes of a and b: [a.hi | b.hi]
#define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))

/// Extracts the 128-bit lane selected by i (0 = low, 1 = high)
#define AKSIMD_EXTRACT_V2F128( a, i ) _mm256_extractf128_ps(a, i)
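// Lane-deinterleave example (illustrative; a = [a.lo|a.hi], b = [b.lo|b.hi] as 128-bit lanes):
//   AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) -> [a.lo|b.lo]
//   AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) -> [a.hi|b.hi]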
/// Given four V8F32s that each hold four (real, imaginary) pairs, transposes them
/// in place so that each output holds the Nth pair of every input
AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)
{
	AKSIMD_V8F32 tmp1, tmp2, tmp3, tmp4;
	// Pair up the 64-bit (real, imaginary) pairs of A/B and C/D within each 128-bit lane
	tmp1 = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(1, 0, 1, 0)); // [a0 a1 b0 b1 | a4 a5 b4 b5]
	tmp2 = AKSIMD_SHUFFLE_V8F32(A, B, AKSIMD_SHUFFLE(3, 2, 3, 2)); // [a2 a3 b2 b3 | a6 a7 b6 b7]
	tmp3 = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(1, 0, 1, 0)); // [c0 c1 d0 d1 | c4 c5 d4 d5]
	tmp4 = AKSIMD_SHUFFLE_V8F32(C, D, AKSIMD_SHUFFLE(3, 2, 3, 2)); // [c2 c3 d2 d3 | c6 c7 d6 d7]
	// Recombine the 128-bit lanes to finish the transpose
	A = AKSIMD_PERMUTE_2X128_V8F32(tmp1, tmp3, AKSIMD_PERMUTE128(2, 0)); // [a0 a1 b0 b1 | c0 c1 d0 d1]
	B = AKSIMD_PERMUTE_2X128_V8F32(tmp2, tmp4, AKSIMD_PERMUTE128(2, 0)); // [a2 a3 b2 b3 | c2 c3 d2 d3]
	C = AKSIMD_PERMUTE_2X128_V8F32(tmp1, tmp3, AKSIMD_PERMUTE128(3, 1)); // [a4 a5 b4 b5 | c4 c5 d4 d5]
	D = AKSIMD_PERMUTE_2X128_V8F32(tmp2, tmp4, AKSIMD_PERMUTE128(3, 1)); // [a6 a7 b6 b7 | c6 c7 d6 d7]
}
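// Usage sketch (illustrative; A..D are hypothetical vectors of interleaved complex pairs):
//   AKSIMD_TRANSPOSE8X4_V8F32( A, B, C, D );
//   // A now holds the first pair of each input, B the second, and so on.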
/// Vector subtraction (see _mm256_sub_ps)
#define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b )

/// Subtracts only the lowest lane of b from a; the other lanes of a pass through unchanged
/// (the integer mask is reinterpreted as floats before the AND)
#define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps( b, _mm256_castsi256_ps(_mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 )) ) )

/// Vector addition (see _mm256_add_ps)
#define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b )

/// Alternating per-lane operation: even lanes compute a - b, odd lanes a + b
#define AKSIMD_ADDSUB_V8F32( a, b ) _mm256_addsub_ps( a, b )

/// Adds only the lowest lane of b to a; the other lanes of a pass through unchanged
#define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps( b, _mm256_castsi256_ps(_mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 )) ) )

/// Vector multiplication (see _mm256_mul_ps)
#define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b )

/// Vector division (see _mm256_div_ps)
#define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b )

/// Multiplies only the lowest lane of a by the lowest lane of b; the other lanes
/// of a are multiplied by 1.0f and therefore pass through unchanged
#define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps( b, _mm256_set1_ps(1.0f), 0xfe ) )
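// _SS example (illustrative; a = [1,2,3,4,5,6,7,8] and b = [10,...] are hypothetical values):
//   AKSIMD_ADD_SS_V8F32( a, b ) -> [11, 2, 3, 4, 5, 6, 7, 8]
// Only the lowest lane combines with b; every other lane of a is returned as-is.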
/// Per-lane minimum (see _mm256_min_ps)
#define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b )

/// Per-lane maximum (see _mm256_max_ps)
#define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b )

/// Absolute value: clears the sign bit of every lane
#define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)

/// Negation: flips the sign bit of every lane
#define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps(_mm256_set1_ps(-0.f), __a__)

/// Per-lane square root (see _mm256_sqrt_ps)
#define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) )

/// Per-lane approximate reciprocal square root (see _mm256_rsqrt_ps)
#define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) )

/// Per-lane approximate reciprocal (see _mm256_rcp_ps)
#define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) )

/// Per-lane round up to an integral value (see _mm256_ceil_ps)
#define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) )

/// Bitwise logic on the raw lane bits
#define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps( a, b )
#define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps( a, b )
#define AKSIMD_AND_V8F32( a, b ) _mm256_and_ps( a, b )
#define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps( a, _mm256_castsi256_ps(_mm256_set1_epi32(~0)) )
/// Horizontal add: returns the sum of all eight input lanes, replicated in every lane
static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32(AKSIMD_V8F32 vVec)
{
	__m256 vAb = _mm256_shuffle_ps(vVec, vVec, 0xB1);             // swap adjacent lanes
	__m256 vHaddAb = _mm256_add_ps(vVec, vAb);                    // pairwise sums
	__m256 vHaddCd = _mm256_shuffle_ps(vHaddAb, vHaddAb, 0x4E);   // swap 64-bit halves per lane
	__m256 vHaddAbcd = _mm256_add_ps(vHaddAb, vHaddCd);           // per-128-bit-lane sums
	__m256 vHaddEfgh = _mm256_permute2f128_ps(vHaddAbcd, vHaddAbcd, 0x01); // swap 128-bit lanes
	__m256 vHaddAll = _mm256_add_ps(vHaddAbcd, vHaddEfgh);        // total in every lane
	return vHaddAll;
}
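// Usage sketch (illustrative; pData is a hypothetical float* with at least 8 values):
//   AKSIMD_V8F32 v = AKSIMD_LOAD_V8F32( pData );
//   AkReal32 fSum;
//   AKSIMD_STORE1_V8F32( &fSum, AKSIMD_HORIZONTALADD_V8F32( v ) ); // fSum = sum of 8 floats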
/// Multiplies four pairs of complex numbers per vector; each complex number occupies
/// two consecutive lanes, laid out as (real, imaginary)
static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(
	const AKSIMD_V8F32 cIn1,
	const AKSIMD_V8F32 cIn2)
{
	__m256 real1Ext = _mm256_moveldup_ps(cIn1);            // reals of cIn1, duplicated: (a, a)
	__m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1);  // cIn2 with real/imag swapped: (d, c)
	__m256 imag1Ext = _mm256_movehdup_ps(cIn1);            // imaginaries of cIn1, duplicated: (b, b)
	__m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf);        // (b*d, b*c) per pair
	__m256 mul = _mm256_mul_ps(real1Ext, cIn2);            // (a*c, a*d) per pair
	__m256 out = _mm256_addsub_ps(mul, temp);              // (a*c - b*d, a*d + b*c)
	return out;
}
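// Worked example (illustrative): with cIn1 holding (1+2i) in its first pair and cIn2
// holding (3+4i), the first output pair is (1*3 - 2*4, 1*4 + 2*3) = (-5, 10), i.e. -5+10i.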
/// Interleaves the lower half of each 128-bit lane of a and b: [a0,b0,a1,b1 | a4,b4,a5,b5]
#define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b )

/// Interleaves the upper half of each 128-bit lane of a and b: [a2,b2,a3,b3 | a6,b6,a7,b7]
#define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b )

/// Type carrying the result of a vector comparison
#define AKSIMD_CMP_CTRLMASKV8 __m256
/// Per-lane a <= b, ordered (a lane is all ones when true, all zeroes when false)
#define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LE_OS )

/// Per-lane a < b, ordered
#define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS )

/// Per-lane a >= b, ordered
#define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS )

/// Per-lane a > b, ordered
#define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS )

/// Per-lane a == b, ordered
#define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS )

/// Per-lane select: returns vB where the sign bit of vMask is set, vA elsewhere
static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
{
	return _mm256_blendv_ps(vA, vB, vMask);
}

/// Per-lane select: b where __cond1__ >= __cond2__, a elsewhere
#define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )

/// Per-lane select: b where a >= 0, c elsewhere
#define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( (__a__), _mm256_set1_ps(0.f) ) )
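// Select example (illustrative; v is a hypothetical vector): clamping negative lanes to zero.
//   AKSIMD_V8F32 vClamped = AKSIMD_SEL_GTEZ_V8F32( v, v, AKSIMD_SETZERO_V8F32() );
//   // lanes where v >= 0 keep their value; negative lanes become 0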
/// Replicates element idx (in [0,3]) of each 128-bit lane across that lane
#define AKSIMD_SPLAT_V8F32( var, idx ) AKSIMD_SHUFFLE_V8F32( var, var, AKSIMD_SHUFFLE(idx,idx,idx,idx) )

/// Gathers the sign bit of each lane into an 8-bit integer mask
#define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ )

/// Returns true when every bit of the vector is zero
#define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256( __a__, __a__ ) != 0)
#define AKSIMD_TESTZERO_V8F32( __a__ ) AKSIMD_TESTZERO_V8I32(_mm256_castps_si256(__a__))

/// Returns true when every bit of the vector is one
#define AKSIMD_TESTONES_V8I32( __a__ ) (_mm256_testc_si256( __a__, _mm256_set1_epi32(~0) ) != 0)
#define AKSIMD_TESTONES_V8F32( __a__ ) AKSIMD_TESTONES_V8I32(_mm256_castps_si256(__a__))
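// Early-out example (illustrative; vIn and vThreshold are hypothetical inputs):
//   AKSIMD_V8F32 vCmp = AKSIMD_GT_V8F32( vIn, vThreshold );
//   if ( AKSIMD_TESTZERO_V8F32( vCmp ) )
//       return; // no lane exceeded the threshold; skip the expensive path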
/// Loads 256 bits of integer data from unaligned memory
#define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (const __m256i*)(__addr__) )

/// Sets all lanes to zero
#define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256()

/// Sets all eight 32-bit lanes to the given scalar value
#define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) )

/// Populates the eight lanes; _a goes to the lowest lane, _h to the highest
#define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

/// Builds a V8I32 from two __m128i halves; m1 goes to the low 128-bit lane, m2 to the high one
#define AKSIMD_SET_V2I128( m1, m2 ) _mm256_setr_m128i(m1, m2)

/// Stores 256 bits of integer data to unaligned memory
#define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (__m256i*)(__addr__), (__vec__) )
/// Converts eight signed 32-bit integers to floats
#define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) )

/// Converts eight floats to signed 32-bit integers, rounding to nearest
#define AKSIMD_ROUND_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) )

/// Converts eight floats to signed 32-bit integers, truncating toward zero
#define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) )
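// Rounding vs. truncation example (illustrative): for a lane holding -1.5f,
//   AKSIMD_ROUND_V8F32_TO_V8I32    -> -2 (round to nearest, ties to even, under the default rounding mode)
//   AKSIMD_TRUNCATE_V8F32_TO_V8I32 -> -1 (truncate toward zero)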
/// Converts eight 16-bit half-precision floats (in a __m128i) to eight 32-bit floats (requires F16C)
#define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) )

/// Converts eight 32-bit floats to eight 16-bit half-precision floats (in a __m128i), rounding to nearest (requires F16C)
#define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph( (__vec__), _MM_FROUND_TO_NEAREST_INT )
/// Bitwise reinterpretation between vector types; these casts generate no instructions
#define AKSIMD_CAST_V4F64_TO_V8F32( __vec__ ) _mm256_castpd_ps( (__vec__) )
#define AKSIMD_CAST_V4F64_TO_V8I32( __vec__ ) _mm256_castpd_si256( (__vec__) )
#define AKSIMD_CAST_V8F32_TO_V4F64( __vec__ ) _mm256_castps_pd( (__vec__) )
#define AKSIMD_CAST_V8F32_TO_V8I32( __vec__ ) _mm256_castps_si256( (__vec__) )
#define AKSIMD_CAST_V8I32_TO_V4F64( __vec__ ) _mm256_castsi256_pd( (__vec__) )
#define AKSIMD_CAST_V8I32_TO_V8F32( __vec__ ) _mm256_castsi256_ps( (__vec__) )
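// Cast example (illustrative; a and b are hypothetical vectors): reinterpreting a
// comparison mask as an integer vector so the integer test macros apply directly.
//   AKSIMD_V8I32 vBits = AKSIMD_CAST_V8F32_TO_V8I32( AKSIMD_EQ_V8F32( a, b ) );
//   bool bAllEqual = AKSIMD_TESTONES_V8I32( vBits );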
#endif // AKSIMD_AVX_SUPPORTED

#endif //_AK_SIMD_AVX_H_