32 #ifndef _AK_SIMD_AVX_H_
33 #define _AK_SIMD_AVX_H_
38 #if defined(AKSIMD_AVX_SUPPORTED)
40 #include <immintrin.h>
// AVX 256-bit vector types.
typedef __m256 AKSIMD_V8F32;   ///< Vector of 8 32-bit floats
typedef __m256d AKSIMD_V4F64;  ///< Vector of 4 64-bit floats
typedef __m256i AKSIMD_V8I32;  ///< Vector of 8 32-bit signed integers
// Comparison results are carried in float vectors (per-lane all-ones/all-zeros masks).
typedef AKSIMD_V8F32 AKSIMD_V8COND;   ///< Generic comparison-result mask
typedef AKSIMD_V8F32 AKSIMD_V8FCOND;  ///< Float comparison-result mask
typedef AKSIMD_V8I32 AKSIMD_V8ICOND;  ///< Integer comparison-result mask
// Unaligned load of 8 floats from __addr__.
#define AKSIMD_LOAD_V8F32( __addr__ ) _mm256_loadu_ps( (AkReal32*)(__addr__) )

// Broadcast one float scalar (lvalue; its address is taken) to all 8 lanes.
#define AKSIMD_LOAD1_V8F32( __scalar__ ) _mm256_broadcast_ss( &(__scalar__) )

// Broadcast one double to all 4 lanes, result reinterpreted as a V8F32.
#define AKSIMD_LOAD1_V4F64( __scalar__ ) _mm256_castpd_ps(_mm256_broadcast_sd( &(__scalar__) ))

// Set all 8 float lanes to __scalar__.
#define AKSIMD_SET_V8F32( __scalar__ ) _mm256_set1_ps( (__scalar__) )

// Set 8 float lanes; _a is lane 0 (lowest), _h is lane 7 (highest).
#define AKSIMD_SETV_V8F32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_ps( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

// Set 4 double lanes (_a lowest), result reinterpreted as a V8F32.
#define AKSIMD_SETV_V4F64( _d, _c, _b, _a ) _mm256_castpd_ps( _mm256_set_pd( (_d), (_c), (_b), (_a) ) )

// All float lanes zero.
#define AKSIMD_SETZERO_V8F32() _mm256_setzero_ps()

// Load a single float into lane 0; lanes 1-7 are zeroed.
#define AKSIMD_LOAD_SS_V8F32( __addr__ ) _mm256_zextps128_ps256(_mm_load_ss( (__addr__) ))

// Build a 256-bit vector from two 128-bit halves: m1 -> low lane, m2 -> high lane.
#define AKSIMD_SETV_V2F128( m2, m1) _mm256_set_m128(m2, m1)

// Insert a 128-bit vector into lane idx (0 = low, 1 = high) of a.
#define AKSIMD_INSERT_V2F128( a, m128, idx) _mm256_insertf128_ps(a, m128, idx)

// Scalar access to individual vector elements. NOTE(review): accesses the
// vector's storage through a casted pointer; relies on the value being
// addressable in memory.
#define AKSIMD_GETELEMENT_V8F32( __vName, __num__ ) ((AkReal32*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V4F64( __vName, __num__ ) ((AkReal64*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V8I32( __vName, __num__ ) ((AkInt32*)&(__vName))[(__num__)]
#define AKSIMD_GETELEMENT_V4I64( __vName, __num__ ) ((AkInt64*)&(__vName))[(__num__)]
// Unaligned store of 8 floats to __addr__.
#define AKSIMD_STORE_V8F32( __addr__, __vec__ ) _mm256_storeu_ps( (AkReal32*)(__addr__), (__vec__) )

// Store only lane 0 of the vector as a scalar float.
#define AKSIMD_STORE1_V8F32( __addr__, __vec__ ) _mm_store_ss( (AkReal32*)(__addr__), _mm256_castps256_ps128( (__vec__) ) )

// 4-element shuffle applied independently within each 128-bit lane;
// i is an AKSIMD_SHUFFLE()-style immediate.
#define AKSIMD_SHUFFLE_V8F32( a, b, i ) _mm256_shuffle_ps( a, b, i )

// Fixed per-128-bit-lane element reorders (named after the resulting order).
#define AKSIMD_SHUFFLE_V8_BADC( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(2,3,0,1))
#define AKSIMD_SHUFFLE_V8_CDAB( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_V8_BCDA( __a__ ) AKSIMD_SHUFFLE_V8F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))

// Duplicate odd-indexed elements within each 128-bit lane: {b,b,d,d}.
#define AKSIMD_DUP_V8_ODD(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
// Duplicate even-indexed elements within each 128-bit lane: {a,a,c,c}.
#define AKSIMD_DUP_V8_EVEN(__vv) AKSIMD_SHUFFLE_V8F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))

// Integer shuffle implemented via the float shuffle; the casts only
// reinterpret bits, so the integer pattern is preserved.
#define AKSIMD_SHUFFLE_V8I32( a, b, i ) _mm256_castps_si256(_mm256_shuffle_ps( _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), i ))

// Variable per-lane permute; selector indices come from vector b.
#define AKSIMD_PERMUTEVAR_V8F32(a, b) _mm256_permutevar_ps(a, b)

// Builds the immediate for _mm256_permute2f128_ps: l0 selects the result's
// low 128-bit lane, l1 its high lane (values 0-1 = lanes of a, 2-3 = lanes of b).
#define AKSIMD_PERMUTE128( l1, l0 ) (((l1) << 4) | (l0))

// Cross-lane 128-bit permute of two vectors.
#define AKSIMD_PERMUTE_2X128_V8F32( a, b, i ) _mm256_permute2f128_ps(a, b, i)

// Gather the low (resp. high) 128-bit lanes of a and b into a single vector.
#define AKSIMD_DEINTERLEAVELANES_LO_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(2, 0))
#define AKSIMD_DEINTERLEAVELANES_HI_V8F32( a, b ) AKSIMD_PERMUTE_2X128_V8F32(a, b, AKSIMD_PERMUTE128(3, 1))

// Extract 128-bit lane i (0 = low, 1 = high) as a __m128.
#define AKSIMD_EXTRACT_V2F128( a, i ) _mm256_extractf128_ps(a, i)
180 AkForceInline void AKSIMD_TRANSPOSE8X4_V8F32(AKSIMD_V8F32& A, AKSIMD_V8F32& B, AKSIMD_V8F32& C, AKSIMD_V8F32& D)
182 AKSIMD_V8F32 tmp1, tmp2, tmp3, tmp4;
// Component-wise a - b.
#define AKSIMD_SUB_V8F32( a, b ) _mm256_sub_ps( a, b )

// Subtract only lane 0 of b from a; lanes 1-7 of a pass through unchanged
// (the mask keeps lane 0 of b and zeroes the rest).
// FIX: _mm256_setr_epi32 returns __m256i, which cannot be passed directly to
// _mm256_and_ps (expects __m256) — reinterpret with _mm256_castsi256_ps.
#define AKSIMD_SUB_SS_V8F32( a, b ) _mm256_sub_ps( a, _mm256_and_ps(b, _mm256_castsi256_ps(_mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )

// Component-wise a + b.
#define AKSIMD_ADD_V8F32( a, b ) _mm256_add_ps( a, b )

// Alternating subtract/add: even-indexed lanes a-b, odd-indexed lanes a+b.
#define AKSIMD_ADDSUB_V8F32( a, b ) _mm256_addsub_ps( a, b )

// Add only lane 0 of b to a; lanes 1-7 of a pass through unchanged.
// FIX: same __m256i -> __m256 cast as AKSIMD_SUB_SS_V8F32.
#define AKSIMD_ADD_SS_V8F32( a, b ) _mm256_add_ps( a, _mm256_and_ps(b, _mm256_castsi256_ps(_mm256_setr_epi32( -1, 0, 0, 0, 0, 0, 0, 0 ) ) ) )
// Component-wise multiply / divide.
#define AKSIMD_MUL_V8F32( a, b ) _mm256_mul_ps( a, b )
#define AKSIMD_DIV_V8F32( a, b ) _mm256_div_ps( a, b )

// Multiply only lane 0 of a by lane 0 of b: blend mask 0xfe replaces lanes
// 1-7 of b with 1.0f, so those lanes of a pass through unchanged.
#define AKSIMD_MUL_SS_V8F32( a, b ) _mm256_mul_ps( a, _mm256_blend_ps(b, _mm256_set1_ps(1.0f), 0xfe ) )

// Component-wise min / max.
#define AKSIMD_MIN_V8F32( a, b ) _mm256_min_ps( a, b )
#define AKSIMD_MAX_V8F32( a, b ) _mm256_max_ps( a, b )

// |a| : clears the sign bit of each lane (andnot with -0.f = sign-bit mask).
#define AKSIMD_ABS_V8F32( a ) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)

// -a : flips the sign bit of each lane.
#define AKSIMD_NEG_V8F32( __a__ ) _mm256_xor_ps(_mm256_set1_ps(-0.f), __a__)

// Component-wise square root.
#define AKSIMD_SQRT_V8F32( __a__ ) _mm256_sqrt_ps( (__a__) )

// Hardware approximate reciprocal square root (limited precision; see
// Intel docs for the error bound).
#define AKSIMD_RSQRT_V8F32( __a__ ) _mm256_rsqrt_ps( (__a__) )

// Hardware approximate reciprocal (limited precision).
#define AKSIMD_RECIP_V8F32( __a__ ) _mm256_rcp_ps( (__a__) )

// Round each lane up to the nearest integral value.
#define AKSIMD_CEIL_V8F32( __a__ ) _mm256_ceil_ps( (__a__) )

// Bitwise logic on the float bit patterns.
#define AKSIMD_XOR_V8F32( a, b ) _mm256_xor_ps(a,b)
#define AKSIMD_OR_V8F32( a, b ) _mm256_or_ps(a,b)
#define AKSIMD_AND_V8F32( a, b) _mm256_and_ps(a,b)
#define AKSIMD_NOT_V8F32( a ) _mm256_xor_ps(a,_mm256_castsi256_ps(_mm256_set1_epi32(~0)))
// NOTE: result is (~a) & b — operand order matches the intrinsic, not "a & ~b".
#define AKSIMD_ANDNOT_V8F32( a, b ) _mm256_andnot_ps(a, b)
272 static AkForceInline AKSIMD_V8F32 AKSIMD_HORIZONTALADD_V8F32(AKSIMD_V8F32 vVec)
274 __m256 vAb = _mm256_shuffle_ps(vVec, vVec, 0xB1);
275 __m256 vHaddAb = _mm256_add_ps(vVec, vAb);
276 __m256 vHaddCd = _mm256_shuffle_ps(vHaddAb, vHaddAb, 0x4E);
277 __m256 vHaddAbcd = _mm256_add_ps(vHaddAb, vHaddCd);
278 __m256 vHaddEfgh = _mm256_permute2f128_ps(vHaddAbcd, vHaddAbcd, 0x01);
279 __m256 vHaddAll = _mm256_add_ps(vHaddAbcd, vHaddEfgh);
284 static AkForceInline AKSIMD_V8F32 AKSIMD_COMPLEXMUL_V8F32(
const AKSIMD_V8F32 cIn1,
const AKSIMD_V8F32 cIn2)
286 __m256 real1Ext = _mm256_moveldup_ps(cIn1);
287 __m256 in2Shuf = _mm256_shuffle_ps(cIn2, cIn2, 0xB1);
288 __m256 imag1Ext = _mm256_movehdup_ps(cIn1);
289 __m256 temp = _mm256_mul_ps(imag1Ext, in2Shuf);
290 __m256 mul = _mm256_mul_ps(real1Ext, cIn2);
291 __m256 out = _mm256_addsub_ps(mul, temp);
// Interleave the low (resp. high) halves of each 128-bit lane of a and b.
#define AKSIMD_UNPACKLO_V8F32( a, b ) _mm256_unpacklo_ps( a, b )
#define AKSIMD_UNPACKHI_V8F32( a, b ) _mm256_unpackhi_ps( a, b )

// Type used to carry comparison control masks.
#define AKSIMD_CMP_CTRLMASKV8 __m256

// Lane-wise comparisons: each result lane is all-ones when the predicate
// holds, all-zeros otherwise. Predicates are ordered (false if either operand
// is NaN) and signaling (OS variants).
#define AKSIMD_LTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LE_OS )
#define AKSIMD_LT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_LT_OS )
#define AKSIMD_GTEQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GE_OS )
#define AKSIMD_GT_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_GT_OS )
// NOTE(review): _CMP_EQ_OS signals on quiet NaN, unlike legacy SSE cmpeq
// (which is OQ) — confirm the signaling variant is intended.
#define AKSIMD_EQ_V8F32( __a__, __b__ ) _mm256_cmp_ps( (__a__), (__b__), _CMP_EQ_OS )
336 static AkForceInline AKSIMD_V8F32 AKSIMD_VSEL_V8F32( AKSIMD_V8F32 vA, AKSIMD_V8F32 vB, AKSIMD_V8F32 vMask )
338 return _mm256_blendv_ps(vA, vB, vMask);
// Per-lane select: (__cond1__ >= __cond2__) ? __b__ : __a__
// (AKSIMD_VSEL_V8F32 picks its second argument where the mask is set).
#define AKSIMD_SEL_GTEQ_V8F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V8F32( __a__, __b__, AKSIMD_GTEQ_V8F32( __cond1__, __cond2__ ) )

// Per-lane select: (__a__ >= 0) ? __b__ : __c__.
#define AKSIMD_SEL_GTEZ_V8F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V8F32( (__c__), (__b__), AKSIMD_GTEQ_V8F32( __a__, _mm256_set1_ps(0) ) )

// Broadcast element idx of each 128-bit lane to all 4 elements of that lane.
#define AKSIMD_SPLAT_V8F32(var, idx) AKSIMD_SHUFFLE_V8F32(var,var, AKSIMD_SHUFFLE(idx,idx,idx,idx))

// Gather the sign bit of each of the 8 lanes into an 8-bit integer mask.
#define AKSIMD_MASK_V8F32( __a__ ) _mm256_movemask_ps( __a__ )

// Non-zero when all 256 bits of the vector are zero.
#define AKSIMD_TESTZERO_V8I32( __a__ ) (_mm256_testz_si256(__a__,__a__) != 0)
#define AKSIMD_TESTZERO_V8F32( __a__) AKSIMD_TESTZERO_V8I32(_mm256_castps_si256(__a__))

// Non-zero when all 256 bits of the vector are one.
#define AKSIMD_TESTONES_V8I32(__a__) (_mm256_testc_si256(__a__, _mm256_set1_epi32(~0)) != 0)
#define AKSIMD_TESTONES_V8F32( __a__) AKSIMD_TESTONES_V8I32(_mm256_castps_si256(__a__))
// Unaligned load of 8 32-bit integers.
// FIX (consistency): cast __addr__ to the vector pointer type so callers can
// pass any pointer, matching AKSIMD_LOAD_V8F32 which casts to AkReal32*.
#define AKSIMD_LOAD_V8I32( __addr__ ) _mm256_loadu_si256( (const AKSIMD_V8I32*)(__addr__) )

// All integer lanes zero.
#define AKSIMD_SETZERO_V8I32() _mm256_setzero_si256()

// Set all 8 integer lanes to __scalar__.
#define AKSIMD_SET_V8I32( __scalar__ ) _mm256_set1_epi32( (__scalar__) )

// Set 8 integer lanes; _a is lane 0 (lowest), _h is lane 7 (highest).
#define AKSIMD_SETV_V8I32( _h, _g, _f, _e, _d, _c, _b, _a ) _mm256_set_epi32( (_h), (_g), (_f), (_e), (_d), (_c), (_b), (_a) )

// Build a 256-bit integer vector from two 128-bit halves: m1 -> low lane, m2 -> high lane.
#define AKSIMD_SET_V2I128(m1, m2) _mm256_setr_m128i(m1, m2)

// Unaligned store of 8 32-bit integers.
// FIX (consistency): cast added to match AKSIMD_STORE_V8F32.
#define AKSIMD_STORE_V8I32( __addr__, __vec__ ) _mm256_storeu_si256( (AKSIMD_V8I32*)(__addr__), (__vec__) )
// Convert 8 signed 32-bit integers to 8 floats.
#define AKSIMD_CONVERT_V8I32_TO_V8F32( __vec__ ) _mm256_cvtepi32_ps( (__vec__) )

// Convert 8 floats to 8 signed 32-bit integers, rounding per the current
// MXCSR rounding mode (round-to-nearest-even by default).
#define AKSIMD_ROUND_V8F32_TO_V8I32( __vec__ ) _mm256_cvtps_epi32( (__vec__) )

// Convert 8 floats to 8 signed 32-bit integers, truncating toward zero.
#define AKSIMD_TRUNCATE_V8F32_TO_V8I32( __vec__ ) _mm256_cvttps_epi32( (__vec__) )

// Half-precision <-> single-precision conversions.
// NOTE(review): these intrinsics require F16C — confirm target support.
#define AKSIMD_CONVERT_V8F16_TO_V8F32( __vec__ ) _mm256_cvtph_ps( (__vec__) )
#define AKSIMD_CONVERT_V8F32_TO_V8F16( __vec__ ) _mm256_cvtps_ph(__vec__, (_MM_FROUND_TO_NEAREST_INT ) )

// Bit-pattern reinterpretation between vector types (no value conversion).
#define AKSIMD_CAST_V4F64_TO_V8F32( __vec__ ) _mm256_castpd_ps(__vec__)
#define AKSIMD_CAST_V4F64_TO_V8I32( __vec__ ) _mm256_castpd_si256(__vec__)
#define AKSIMD_CAST_V8F32_TO_V4F64( __vec__ ) _mm256_castps_pd(__vec__)
#define AKSIMD_CAST_V8F32_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__)
#define AKSIMD_CAST_V8I32_TO_V4F64( __vec__ ) _mm256_castsi256_pd(__vec__)
#define AKSIMD_CAST_V8I32_TO_V8F32( __vec__ ) _mm256_castsi256_ps(__vec__)

// Condition masks are stored as V8F32 (see typedefs), so the float casts are
// identity and the integer casts are plain reinterpretations.
#define AKSIMD_CAST_V8COND_TO_V8F32( __vec__ ) (__vec__)
#define AKSIMD_CAST_V8F32_TO_V8COND( __vec__ ) (__vec__)
#define AKSIMD_CAST_V8COND_TO_V8I32( __vec__ ) _mm256_castps_si256(__vec__)
#define AKSIMD_CAST_V8I32_TO_V8COND( __vec__ ) _mm256_castsi256_ps(__vec__)
457 #endif //_AK_SIMD_AVX_H_