34 #if defined _MSC_VER && defined _M_ARM64
35 #include <arm64_neon.h>
// Rounds __Size__ up to the next multiple of 16 bytes (SIMD alignment):
// (x + 15) & ~15. Assumes a non-negative integer argument.
47 #define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
// Loads four floats. NEON vld1q_f32 takes a const pointer and has no
// alignment requirement; the casts are const-correct to match the integer
// load macros ((const int32_t*)) instead of discarding const.
58 #define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (const float32_t*)(__addr__) )
// Unaligned 4-float load; identical to AKSIMD_LOAD_V4F32 on NEON.
62 #define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (const float32_t*)(__addr__) )
// Loads a single float and replicates it into all four lanes.
66 #define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (const float32_t*)(&(__scalar__)) )
// Sets all four lanes to __scalar__.
70 #define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
74 float32x4_t ret = vdupq_n_f32(0);
75 ret = vsetq_lane_f32(d, ret, 3);
76 ret = vsetq_lane_f32(c, ret, 2);
77 ret = vsetq_lane_f32(b, ret, 1);
78 ret = vsetq_lane_f32(a, ret, 0);
// Sets all four 32-bit integer lanes to __scalar__.
83 #define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
86 int32x4_t ret = vdupq_n_s32(0);
87 ret = vsetq_lane_s32(d, ret, 3);
88 ret = vsetq_lane_s32(c, ret, 2);
89 ret = vsetq_lane_s32(b, ret, 1);
90 ret = vsetq_lane_s32(a, ret, 0);
101 #if defined AK_CPU_ARM_64
102 int64x2_t ret = vdupq_n_s64(0);
103 ret = vsetq_lane_s64(b, ret, 1);
104 ret = vsetq_lane_s64(a, ret, 0);
105 return vreinterpretq_s32_s64(ret);
107 int32x4_t ret = vdupq_n_s32(0);
108 ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
109 ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
110 ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
111 ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
118 #if defined AK_CPU_ARM_64
119 float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
120 ret = vsetq_lane_f64(b, ret, 1);
121 ret = vsetq_lane_f64(a, ret, 0);
122 return (float32x4_t)(ret);
124 int64_t a64 = *(int64_t*)&a;
125 int64_t b64 = *(int64_t*)&b;
126 int32x4_t ret = vdupq_n_s32(0);
127 ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
128 ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
129 ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
130 ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
131 return vreinterpretq_f32_s32(ret);
// Vector of four zero floats.
137 #define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
// Loads one float into lane 0 and zeroes the other lanes (like _mm_load_ss).
142 #define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
// Loads four 32-bit integers.
145 #define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
// Loads eight 16-bit integers into a 128-bit register.
148 #define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
// Loads four 16-bit integers into a 64-bit register.
151 #define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
// Unaligned variant; identical to AKSIMD_LOAD_V4I32 on NEON.
154 #define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
// Vector of four zero integers.
156 #define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
// Loads two floats into a 64-bit register. Casts are const-correct
// (vld1_f32 takes const float32_t*), matching the integer load macros.
159 #define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (const float32_t*)(__addr__) )
// Loads one float into lane __lane__ of __vec__ (lane must be a compile-time constant).
160 #define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )
// Sets both lanes to __scalar__.
163 #define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
// Vector of two zero floats.
166 #define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
// De-interleaving loads: vld2 splits even/odd elements into .val[0]/.val[1].
169 #define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (const float32_t*)(__addr__) )
170 #define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (const float32_t*)(__addr__) )
// Lane variants of the de-interleaving loads (lane must be a compile-time constant).
173 #define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )
174 #define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (const float32_t*)(__addr__), (__vec__), (__lane__) )
// Stores four floats; NEON vst1q has no alignment requirement.
186 #define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
// Unaligned store; identical to AKSIMD_STORE_V4F32 on NEON.
190 #define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
// Stores only lane 0.
194 #define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
// Integer stores (aligned/unaligned are identical on NEON).
197 #define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
200 #define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
203 #define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
// Stores two floats. NOTE(review): casts to AkReal32* where sibling macros use
// float32_t* — presumably the same underlying type; confirm.
206 #define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
// Interleaving stores: vst2 writes .val[0]/.val[1] interleaved in memory.
209 #define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
210 #define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
// Stores the low 64 bits (two floats viewed as one f64 lane).
// NOTE(review): vst1q_lane_f64/vreinterpretq_f64_f32 are AArch64-only —
// confirm this macro is only reached on AK_CPU_ARM_64 builds.
214 #define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
// Converts four signed 32-bit integers to floats.
226 #define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
231 #if defined AK_CPU_ARM_64
232 return vcvtaq_s32_f32(a);
235 float32x4_t halfPos = vdupq_n_f32(0.5f);
236 float32x4_t halfNeg = vdupq_n_f32(-0.5f);
237 float32x4_t zero = vdupq_n_f32(0.0f);
238 const uint32x4_t signMask = vcgtq_f32(a, zero);
239 const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
240 const float32x4_t aOffset = vaddq_f32(a, signedHalf);
241 return vcvtq_s32_f32(aOffset);
// Converts floats to signed ints, truncating toward zero (vcvtq_s32_f32 truncates).
247 #define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
// Expands the low/high four half-precision values (packed in s32 lanes)
// to single precision via the shared helper.
251 #define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
255 #define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
260 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
265 __asm__(
"fcvtl %0.4s, %1.4h" \
270 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
271 float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
272 float32x4_t ret = vcvt_f32_f16(vecf16);
273 uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
274 ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
276 #elif defined(AK_CPU_ARM_64)
277 return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
279 uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
280 uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
281 uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3);
284 uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
285 uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
286 uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
289 uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
290 uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
293 uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
294 uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
295 uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
298 uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
299 float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
300 return assembledFloat;
309 #if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
313 __asm__(
"fcvtn %1.4h, %1.4s\n" \
314 "\tmov %0.8b, %1.8b" \
319 #elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
320 float16x4_t ret = vcvt_f16_f32(vec);
321 uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
322 ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
323 return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
324 #elif defined(AK_CPU_ARM_64)
325 return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
327 uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
328 uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
332 float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
333 uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
336 uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF);
337 uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
338 uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
339 uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31);
340 uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb);
341 uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
344 uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23);
345 uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
347 uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
350 uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000));
351 uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
352 uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec));
353 uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000));
354 uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000));
355 uint32x4_t infNanData = vorrq_u32(infData, nantissaBit);
357 uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData);
360 uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
363 uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
364 return vreinterpretq_s32_u16(resultZip.val[1]);
// Bit-pattern casts involving f64 vectors.
// NOTE(review): C-style casts between NEON vector types are a GCC/Clang
// extension; MSVC ARM64 (supported at the top of this file) models the types
// as structs and rejects such casts — consider the vreinterpretq_* forms
// (AArch64-only for f64). Confirm which compilers exercise these macros.
377 #define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
381 #define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
385 #define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
// Reinterprets float lanes as signed ints (bit pattern preserved).
// Uses vreinterpretq — portable to MSVC ARM64, which rejects C-style casts
// between NEON vector types — matching the V4COND cast macros below.
389 #define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) vreinterpretq_s32_f32(__vec__)
// Bit-pattern cast to an f64 vector (AArch64-only type).
// NOTE(review): C-style vector cast is a GCC/Clang extension — MSVC ARM64
// rejects it; consider vreinterpretq_f64_s32 where available.
393 #define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
// Reinterprets signed int lanes as floats (bit pattern preserved).
// vreinterpretq form is portable to MSVC ARM64, unlike a C-style vector cast.
397 #define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) vreinterpretq_f32_s32(__vec__)
// Bit-pattern casts between condition masks (uint32x4_t) and floats.
399 #define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
401 #define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
// Bit-pattern casts between condition masks and signed ints. The original
// C-style casts were (a) a GCC/Clang-only vector extension that MSVC ARM64
// rejects and (b) cast the TO_V4I32 result to AKSIMD_V4COND rather than
// int32x4_t; the vreinterpretq forms fix both and match the two macros above
// (which establish AKSIMD_V4COND as uint32x4_t).
404 #define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) vreinterpretq_s32_u32(__vec__)
407 #define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) vreinterpretq_u32_s32(__vec__)
// Bitwise AND of integer vectors.
418 #define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
// Per-lane signed 16-bit greater-than; result lanes are all-ones/all-zero
// masks, repacked as V4I32.
422 #define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
423 vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
// Float less-or-equal; returns a uint32x4_t mask.
426 #define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
// Signed 32-bit compares; masks reinterpreted back to V4I32.
428 #define AKSIMD_CMPLT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcltq_s32(__a__, __b__))
429 #define AKSIMD_CMPGT_V4I32( __a__, __b__) vreinterpretq_s32_u32(vcgtq_s32(__a__,__b__))
// Bitwise OR.
431 #define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32(__a__,__b__)
// Bitwise NOT. Uses the dedicated vmvnq_s32 intrinsic instead of XOR with an
// all-ones vector — simpler, and avoids passing the unsigned literal ~0u to
// vdupq_n_s32, whose parameter is int32_t.
432 #define AKSIMD_NOT_V4I32( __a__ ) vmvnq_s32(__a__)
// Bitwise XOR.
434 #define AKSIMD_XOR_V4I32(__a__, __b__) veorq_s32(__a__, __b__)
438 uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
439 uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
440 uint32x4_t res = veorq_u32(t0, t1);
441 return vreinterpretq_f32_u32(res);
446 uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
447 uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
448 uint32x4_t res = vorrq_u32(t0, t1);
449 return vreinterpretq_f32_u32(res);
454 uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
455 uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
456 uint32x4_t res = vandq_u32(t0, t1);
457 return vreinterpretq_f32_u32(res);
462 uint32x4_t allSet = vdupq_n_u32(~0u);
463 uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
464 uint32x4_t result = veorq_u32(reinterpret, allSet);
465 return vreinterpretq_f32_u32(result);
// Bitwise OR/AND of condition masks (uint32x4_t).
468 #define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
469 #define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
// Lane-wise 32-bit integer subtraction.
471 #define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
// Shifts each 32-bit lane left. __shiftBy__ must be a compile-time constant
// (vshlq_n_s32 is an immediate-form intrinsic).
482 #define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
483 vshlq_n_s32( (__vec__), (__shiftBy__) )
// Logical (zero-filling) right shift: lanes are reinterpreted as unsigned so
// the sign bit is not replicated. Immediate-form shift count.
487 #define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
488 vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
// Arithmetic (sign-extending) right shift. Immediate-form shift count.
492 #define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
493 vshrq_n_s32( (__vec__), (__shiftBy__) )
// Concatenates two 2-float vectors into one 4-float vector (a = low half).
505 #define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
// Builds an SSE-style 8-bit shuffle immediate from four 2-bit lane selectors.
508 #define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
509 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
517 #if defined(__clang__)
518 #if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
519 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
520 __builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
524 #ifndef AKSIMD_SHUFFLE_V4F32
525 #define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
526 _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
// Integer variant of SHUFFLE_V4F32, implemented by reinterpreting through f32.
534 #define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
// _mm_movehl_ps equivalent: { b2, b3, a2, a3 }.
541 #define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
// _mm_movelh_ps equivalent: { a0, a1, b0, b1 }.
548 #define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
// Swaps within each 64-bit pair: { b, a, d, c } (vrev64q avoids a full shuffle).
551 #define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
// Swaps the two halves: { c, d, a, b }.
554 #define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
// Rotates lanes down by one: { b, c, d, a }.
557 #define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
// Duplicates the odd lanes: { b, b, d, d }.
560 #define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
// Duplicates the even lanes: { a, a, c, c }.
563 #define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
// Lane-wise float subtraction / addition.
574 #define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
578 #define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
// Subtracts only lane 0 of __b__ (subtrahend lanes 1-3 are zero, so those
// lanes of __a__ pass through), like _mm_sub_ss.
583 #define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
584 vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
588 #define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
592 #define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
595 #define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
// 32-bit lane multiplies; vmulq_s32 keeps the low 32 bits of each product.
598 #define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
601 #define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32(__a__, __b__)
// Lane-wise equality; each result lane is all-ones or all-zero.
605 #define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
609 #define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
// Adds only lane 0 of __b__ (addend lanes 1-3 are zero), like _mm_add_ss.
614 #define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
615 vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
// Lane-wise float multiply.
619 #define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
// Multiplies each lane by a scalar.
623 #define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
630 inv = vmulq_f32(restep, inv);
631 return vmulq_f32(a, inv);
// Lane-wise 2-float multiply.
636 #define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
// Multiplies both lanes by a scalar.
640 #define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
// Multiplies by lane 0 of __b__ only. NOTE(review): unlike _mm_mul_ss, lanes
// 1-3 of the result are zero (multiplied by the zeroed lanes), not __a__'s
// lanes — confirm callers only read lane 0.
646 #define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
647 vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
// Multiply-add: returns (__a__ * __b__) + (__c__).
// AArch64 uses true fused-FMA intrinsics (single rounding).
650 #if defined(AK_CPU_ARM_64)
651 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
652 #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
653 #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
654 #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
// Non-AArch64 fallback: vmla / mul+add (two roundings; results may differ
// slightly from the fused path).
656 #define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
657 #define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
658 #define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
659 #define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
// Multiply-subtract: (__a__ * __b__) - (__c__).
664 #define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
665 #define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
// Lane-wise min/max.
675 #define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
679 #define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
683 #define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
687 #define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
// Absolute value / negation.
690 #define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
693 #define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
694 #define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
// Approximate sqrt(x), computed as recip_estimate(rsqrt_estimate(x)).
// NOTE(review): both are low-precision hardware estimates with no
// Newton-Raphson refinement — confirm this accuracy is acceptable to callers.
697 #define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
// Approximate 1/sqrt(x) (estimate only).
700 #define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
703 #define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
// Approximate reciprocal 1/x (estimate only, no refinement).
706 #define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
724 #ifdef AKSIMD_DECLARE_V4F32
727 static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
730 float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2);
732 float32x4_t vC1Rev = vrev64q_f32(vCIn1);
733 float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
742 #ifdef AKSIMD_DECLARE_V4F32
745 static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
748 return vaddq_f32(vIn1, vIn2SignFlip);
// Interleaves the low/high 16-bit halves of two vectors (like
// _mm_unpacklo/_mm_unpackhi_epi16), repacked as V4I32.
761 #define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
765 #define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
772 float32x2_t xy = vget_low_f32( in_vec1 );
773 float32x2_t ab = vget_low_f32( in_vec2 );
774 float32x2x2_t xa_yb = vtrn_f32( xy, ab );
775 AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
784 float32x2_t zw = vget_high_f32( in_vec1 );
785 float32x2_t cd = vget_high_f32( in_vec2 );
786 float32x2x2_t zc_wd = vtrn_f32( zw, cd );
787 AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
795 int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
796 int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
797 int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
798 return vreinterpretq_s32_s16( result );
// { in_vec1[1], in_vec2[0] }: high lane of the first, low lane of the second.
803 #define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
// Pairwise transposes; vtrn returns both interleaved results in a x2 struct.
807 #define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
809 #define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
// Swaps the two lanes of a 2-float vector.
812 #define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
830 ret16 = vld2_lane_s16(addr0, ret16, 0);
831 ret16 = vld2_lane_s16(addr1, ret16, 1);
832 ret16 = vld2_lane_s16(addr2, ret16, 2);
833 ret16 = vld2_lane_s16(addr3, ret16, 3);
836 vmovl_s16(ret16.val[0]),
837 vmovl_s16(ret16.val[1])
863 ret16 = vld4_lane_s16(addr0, ret16, 0);
864 ret16 = vld4_lane_s16(addr1, ret16, 1);
865 ret16 = vld4_lane_s16(addr2, ret16, 2);
866 ret16 = vld4_lane_s16(addr3, ret16, 3);
869 vmovl_s16(ret16.val[0]),
870 vmovl_s16(ret16.val[1]),
871 vmovl_s16(ret16.val[2]),
872 vmovl_s16(ret16.val[3])
// Type used for comparison result masks.
885 #define AKSIMD_CMP_CTRLMASK uint32x4_t
// Lane-wise float comparisons; each result lane is all-ones or all-zero.
888 #define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
891 #define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
894 #define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
897 #define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
// Signed integer / equality comparisons.
900 #define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
903 #define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
906 #define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
// Per-lane select: where mask __c__ is set take __b__, else __a__.
909 #define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
// Select __b__ where __cond1__ >= __cond2__, else __a__.
912 #define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
// Select __b__ where __a__ >= 0, else __c__.
915 #define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
// Broadcasts lane idx to all four lanes (idx must be a compile-time constant).
917 #define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
921 #ifdef AKSIMD_DECLARE_V4F32
923 static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
925 static const uint32x4_t movemask = { 1, 2, 4, 8 };
926 static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
929 uint32x4_t t0 = in_vec1;
930 uint32x4_t t1 = vtstq_u32(t0, highbit);
931 uint32x4_t t2 = vandq_u32(t1, movemask);
932 uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
933 return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
946 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
947 uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
948 return maxValue == 0;
950 int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
951 return vget_lane_s64(orReduce, 0) == 0;
// Forward float/mask zero tests to the integer version via bit reinterpret.
954 #define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
955 #define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
959 #if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
960 uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
961 return minValue == ~0;
963 int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
964 return vget_lane_s64(andReduce, 0) == ~0LL;
// Forward float/mask all-ones tests to the integer version via bit reinterpret.
967 #define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
968 #define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))