// ARM NEON backend of the AKSIMD portable SIMD abstraction layer.
#ifndef _AKSIMD_ARM_NEON_H_
#define _AKSIMD_ARM_NEON_H_
// MSVC targeting ARM64 ships the NEON intrinsics in <arm64_neon.h>
// rather than the usual <arm_neon.h>.
#if defined _MSC_VER && defined _M_ARM64
#include <arm64_neon.h>
// Furthest useful prefetch distance (bytes) and the assumed cache-line size.
#define AKSIMD_ARCHMAXPREFETCHSIZE (512)
#define AKSIMD_ARCHCACHELINESIZE (64)
// __builtin_prefetch exists only on clang/GCC; on other compilers the hint
// expands to nothing. NOTE(review): the '#else' separating the two
// definitions below is outside this chunk's view.
#if defined __clang__ || defined __GNUC__
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ ) __builtin_prefetch(((char *)(__add__))+(__offset__))
#define AKSIMD_PREFETCHMEMORY( __offset__, __add__ )
// Round a byte count up to the next multiple of 16 (one 128-bit NEON register).
#define AKSIMD_ALIGNSIZE( __Size__ ) (((__Size__) + 15) & ~15)
#if defined(AK_CPU_ARM_NEON)
// Aligned and unaligned 4-float loads. vld1q_f32 has no alignment
// requirement on NEON, so both variants map to the same intrinsic.
#define AKSIMD_LOAD_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOADU_V4F32( __addr__ ) vld1q_f32( (float32_t*)(__addr__) )
// Broadcast the float stored at &__scalar__ into all four lanes.
#define AKSIMD_LOAD1_V4F32( __scalar__ ) vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
// Broadcast a scalar value into all four lanes.
#define AKSIMD_SET_V4F32( __scalar__ ) vdupq_n_f32( __scalar__ )
// Body of the four-float "set" helper (signature outside this view):
// builds the vector lane by lane as { a, b, c, d }.
float32x4_t ret = vdupq_n_f32(0);
ret = vsetq_lane_f32(d, ret, 3);
ret = vsetq_lane_f32(c, ret, 2);
ret = vsetq_lane_f32(b, ret, 1);
ret = vsetq_lane_f32(a, ret, 0);
// Broadcast a scalar int into all four lanes.
#define AKSIMD_SET_V4I32( __scalar__ ) vdupq_n_s32( __scalar__ )
// Body of the four-int "set" helper (signature outside this view).
int32x4_t ret = vdupq_n_s32(0);
ret = vsetq_lane_s32(d, ret, 3);
ret = vsetq_lane_s32(c, ret, 2);
ret = vsetq_lane_s32(b, ret, 1);
ret = vsetq_lane_s32(a, ret, 0);
// 64-bit-pair "set" helper: on ARM64 the two 64-bit values are inserted
// directly; the 32-bit fallback splits each into hi/lo 32-bit lanes.
#if defined AK_CPU_ARM_64
int64x2_t ret = vdupq_n_s64(0);
ret = vsetq_lane_s64(b, ret, 1);
ret = vsetq_lane_s64(a, ret, 0);
return vreinterpretq_s32_s64(ret);
int32x4_t ret = vdupq_n_s32(0);
ret = vsetq_lane_s32(int32_t((b >> 32) & 0xFFFFFFFF), ret, 3);
ret = vsetq_lane_s32(int32_t((b >> 0) & 0xFFFFFFFF), ret, 2);
ret = vsetq_lane_s32(int32_t((a >> 32) & 0xFFFFFFFF), ret, 1);
ret = vsetq_lane_s32(int32_t((a >> 0) & 0xFFFFFFFF), ret, 0);
// Double-pair "set" helper: ARM64 inserts the doubles as f64 lanes and
// bit-casts to f32x4; the fallback type-puns each double through int64.
#if defined AK_CPU_ARM_64
float64x2_t ret = (float64x2_t)vdupq_n_s64(0);
ret = vsetq_lane_f64(b, ret, 1);
ret = vsetq_lane_f64(a, ret, 0);
return (float32x4_t)(ret);
// NOTE(review): type-punning a double through a pointer cast violates
// strict aliasing; memcpy into an int64_t would be the safe form.
int64_t a64 = *(int64_t*)&a;
int64_t b64 = *(int64_t*)&b;
int32x4_t ret = vdupq_n_s32(0);
ret = vsetq_lane_s32(int32_t((b64 >> 32) & 0xFFFFFFFF), ret, 3);
ret = vsetq_lane_s32(int32_t((b64 >> 0) & 0xFFFFFFFF), ret, 2);
ret = vsetq_lane_s32(int32_t((a64 >> 32) & 0xFFFFFFFF), ret, 1);
ret = vsetq_lane_s32(int32_t((a64 >> 0) & 0xFFFFFFFF), ret, 0);
return vreinterpretq_f32_s32(ret);
// All-zero 4-float vector.
#define AKSIMD_SETZERO_V4F32() AKSIMD_SET_V4F32( 0 )
// Load one float into lane 0 of a vector whose other lanes are zero
// (NEON equivalent of _mm_load_ss).
// Fix: removed the stray trailing ';' from the expansion so the macro is a
// pure expression and can be used e.g. as a function argument without
// splicing an empty statement / syntax error into the caller; every other
// load macro in this header expands without one.
#define AKSIMD_LOAD_SS_V4F32( __addr__ ) vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 )
// Loads of 4x32-bit ints, 8x16-bit ints (128-bit) and 4x16-bit ints (64-bit).
#define AKSIMD_LOAD_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__) )
#define AKSIMD_LOAD_V8I16( __addr__ ) vld1q_s16( (const int16_t*)(__addr__) )
#define AKSIMD_LOAD_V4I16( __addr__ ) vld1_s16( (const int16_t*)(__addr__) )
// Unaligned int load — same intrinsic as the aligned variant on NEON.
#define AKSIMD_LOADU_V4I32( __addr__ ) vld1q_s32( (const int32_t*)(__addr__))
// All-zero 4-int vector.
#define AKSIMD_SETZERO_V4I32() vdupq_n_s32( 0 )
// Load two floats into a 64-bit (2-lane) vector.
#define AKSIMD_LOAD_V2F32( __addr__ ) vld1_f32( (float32_t*)(__addr__) )
// Load a single float from __addr__ into lane __lane__ of the 2-lane vector
// __vec__, leaving the other lane untouched.
// Fix: dropped the trailing ';' so the macro expands to a pure expression,
// consistent with the other AKSIMD load macros in this file.
#define AKSIMD_LOAD_V2F32_LANE( __addr__, __vec__, __lane__ ) vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
// Broadcast a scalar into both lanes of a 2-float vector / all-zero variant.
#define AKSIMD_SET_V2F32( __scalar__ ) vdup_n_f32( __scalar__ )
#define AKSIMD_SETZERO_V2F32() AKSIMD_SET_V2F32( 0 )
// De-interleaving loads: vld2[q]_f32 splits interleaved pairs into two
// separate vectors (returned as a float32x4x2_t / float32x2x2_t).
#define AKSIMD_LOAD_V4F32X2( __addr__ ) vld2q_f32( (float32_t*)(__addr__) )
#define AKSIMD_LOAD_V2F32X2( __addr__ ) vld2_f32( (float32_t*)(__addr__) )
// De-interleaving lane loads: read one interleaved pair (vld2) or quad (vld4q)
// from __addr__ into lane __lane__ of the multi-vector __vec__.
// Fix: removed the trailing ';' from both expansions so they are usable as
// expressions; all other AKSIMD load macros expand without one.
#define AKSIMD_LOAD_V2F32X2_LANE( __addr__, __vec__, __lane__ ) vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
#define AKSIMD_LOAD_V4F32X4_LANE( __addr__, __vec__, __lane__ ) vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) )
// Aligned / unaligned stores of four floats (same intrinsic on NEON).
#define AKSIMD_STORE_V4F32( __addr__, __vName__ ) vst1q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STOREU_V4F32( __addr__, __vec__ ) vst1q_f32( (float32_t*)(__addr__), (__vec__) )
// Store only lane 0 as a single float.
#define AKSIMD_STORE1_V4F32( __addr__, __vec__ ) vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
// Integer stores: aligned, unaligned, and unsigned variants.
#define AKSIMD_STORE_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4I32( __addr__, __vec__ ) vst1q_s32( (int32_t*)(__addr__), (__vec__) )
#define AKSIMD_STOREU_V4UI32( __addr__, __vec__ ) vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
// Store a 2-lane float vector (64 bits).
#define AKSIMD_STORE_V2F32( __addr__, __vName__ ) vst1_f32( (AkReal32*)(__addr__), (__vName__) )
// Interleaving stores (inverse of the vld2 loads above).
#define AKSIMD_STORE_V4F32X2( __addr__, __vName__ ) vst2q_f32( (float32_t*)(__addr__), (__vName__) )
#define AKSIMD_STORE_V2F32X2( __addr__, __vName__ ) vst2_f32( (float32_t*)(__addr__), (__vName__) )
// Store the low 64 bits of a float vector as one "double" lane.
#define AKSIMD_STORE1_V2F64( __addr__, __vec__ ) vst1q_lane_f64( (float64_t*)(__addr__), vreinterpretq_f64_f32(__vec__), 0 )
// Lane-wise int32 -> float32 conversion.
#define AKSIMD_CONVERT_V4I32_TO_V4F32( __vec__ ) vcvtq_f32_s32( __vec__ )
// Round-to-nearest float -> int conversion helper (signature outside this
// view). ARM64 has a direct round-to-nearest-with-ties-away instruction;
// the fallback adds +/-0.5 depending on sign, then truncates.
#if defined AK_CPU_ARM_64
return vcvtaq_s32_f32(a);
float32x4_t halfPos = vdupq_n_f32(0.5f);
float32x4_t halfNeg = vdupq_n_f32(-0.5f);
float32x4_t zero = vdupq_n_f32(0.0f);
const uint32x4_t signMask = vcgtq_f32(a, zero);
const float32x4_t signedHalf = vbslq_f32(signMask, halfPos, halfNeg);
const float32x4_t aOffset = vaddq_f32(a, signedHalf);
return vcvtq_s32_f32(aOffset);
// Truncating (round-toward-zero) float -> int conversion.
#define AKSIMD_TRUNCATE_V4F32_TO_V4I32( __vec__ ) vcvtq_s32_f32( (__vec__) )
// Half-precision -> single-precision conversion of the low / high four
// 16-bit lanes of an int vector, via a shared helper.
#define AKSIMD_CONVERT_V4F16_TO_V4F32_LO(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_low_s32( __vec__)))
#define AKSIMD_CONVERT_V4F16_TO_V4F32_HI(__vec__) AKSIMD_CONVERT_V4F16_TO_V4F32_HELPER( vreinterpret_u16_s32(vget_high_s32( __vec__)))
// f16 -> f32 helper body (signature outside this view). Old GCC (< 6.1)
// lacks the needed intrinsic, so inline asm issues FCVTL directly.
#if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
__asm__(
"fcvtl %0.4s, %1.4h" \
// macOS ARM64 path: convert, then re-OR the sign bits extracted from the
// raw half-precision input.
#elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
float16x4_t vecf16 = vreinterpret_f16_s16(vecs16);
float32x4_t ret = vcvt_f32_f16(vecf16);
uint32x4_t signData = vshll_n_u16(vand_u16(vecs16, vdup_n_u16(0x8000)), 16);
// NOTE(review): a uint32x4_t result is assigned to the float32x4_t 'ret'
// without a vreinterpret — relies on lax vector conversions; verify.
ret = vorrq_u32(vreinterpretq_s32_f32(ret), signData);
#elif defined(AK_CPU_ARM_64)
return vcvt_f32_f16(vreinterpret_f16_s16(vecs16));
// Portable ARMv7 path: software half -> float expansion done with integer
// bit manipulation (exponent rebias, denormal fix-up, inf/NaN handling).
uint32x4_t vecExtended = vshlq_n_u32(vmovl_u16(vecs16), 16);
uint32x4_t expMantData = vandq_u32(vecExtended, vdupq_n_u32(0x7fff0000));
uint32x4_t expMantShifted = vshrq_n_u32(expMantData, 3);
// Denormal halves need a different exponent bias plus a post-subtract.
uint32x4_t isDenormMask = vcltq_u32(expMantData, vdupq_n_u32(0x03ff0000));
uint32x4_t exponentIncrement = vbslq_u32(isDenormMask, vdupq_n_u32(0x38800000), vdupq_n_u32(0x38000000));
uint32x4_t postIncrementAdjust = vandq_u32(isDenormMask, vdupq_n_u32(0x38800000));
uint32x4_t expMantScaled = vaddq_u32(expMantShifted, exponentIncrement);
uint32x4_t expMantAdj = vreinterpretq_u32_f32(vsubq_f32(vreinterpretq_f32_u32(expMantScaled), vreinterpretq_f32_u32(postIncrementAdjust)));
// Inputs that were inf/NaN in half precision get a full float exponent.
uint32x4_t isInfnanMask = vcgtq_u32(expMantData, vdupq_n_u32(0x7bffffff));
uint32x4_t infnanExp = vandq_u32(isInfnanMask, vdupq_n_u32(0x7f800000));
uint32x4_t expMantWithInfNan = vorrq_u32(expMantAdj, infnanExp);
// Re-attach the sign bit and bit-cast back to float.
uint32x4_t signData = vandq_u32(vecExtended, vdupq_n_u32(0x80000000));
float32x4_t assembledFloat = vreinterpretq_f32_u32(vorrq_u32(signData, expMantWithInfNan));
return assembledFloat;
// f32 -> f16 helper body (signature outside this view); same per-platform
// structure as the f16 -> f32 helper above.
#if defined(AK_CPU_ARM_64) && (defined(__GNUC__) && !defined(__llvm__)) && (__GNUC__ < 6 || __GNUC__ == 6 && __GNUC_MINOR__ < 1)
__asm__(
"fcvtn %1.4h, %1.4s\n" \
"\tmov %0.8b, %1.8b" \
#elif defined(AK_CPU_ARM_64) && defined(AK_MAC_OS_X)
float16x4_t ret = vcvt_f16_f32(vec);
uint16x4_t signData = vshrn_n_u32(vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000)), 16);
// NOTE(review): uint16x4_t assigned to float16x4_t 'ret' without a
// vreinterpret — relies on lax vector conversions; verify.
ret = vorr_u16(vreinterpret_s16_f16(ret), signData);
return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(ret), vdup_n_s16(0)));
#elif defined(AK_CPU_ARM_64)
return vreinterpretq_s32_s16(vcombine_s16(vreinterpret_s16_f16(vcvt_f16_f32(vec)), vdup_n_s16(0)));
// Portable ARMv7 path: software float -> half with round-to-nearest-even,
// denormal handling via a +0.5f trick, and inf/NaN propagation.
uint32x4_t signData = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x80000000));
uint32x4_t unsignedVec = vandq_u32(vreinterpretq_u32_f32(vec), vdupq_n_u32(0x7fffffff));
// Denormal result path: adding 0.5f aligns the mantissa bits so they can
// be extracted with a shift.
float32x4_t denormedVec = vaddq_f32(vreinterpretq_f32_u32(unsignedVec), vdupq_n_f32(0.5f));
uint32x4_t denormResult = vshlq_n_u32(vreinterpretq_u32_f32(denormedVec), 16);
// Normal result path: magic-constant add performs the rebias + rounding.
uint32x4_t subnormMagic = vdupq_n_u32(0xC8000FFF);
uint32x4_t normRoundPart1 = vaddq_u32(unsignedVec, subnormMagic);
uint32x4_t mantLsb = vshlq_n_u32(unsignedVec, 31 - 13);
uint32x4_t mantSignExtendLsb = vshrq_n_u32(mantLsb, 31);
uint32x4_t normRoundPart2 = vsubq_u32(normRoundPart1, mantSignExtendLsb);
uint32x4_t normResult = vshlq_n_u32(normRoundPart2, 3);
// Pick denormal vs normal encoding per lane.
uint32x4_t normalMinimum = vdupq_n_u32((127 - 14) << 23);
uint32x4_t denormMask = vcgtq_u32(normalMinimum, unsignedVec);
uint32x4_t nonNanFloat = vbslq_u32(denormMask, denormResult, normResult);
// Inf/NaN handling: values >= 0x47800000 overflow half range.
uint32x4_t isNotInfNanMask = vcltq_u32(unsignedVec, vdupq_n_u32(0x47800000));
uint32x4_t mantissaData = vandq_u32(unsignedVec, vdupq_n_u32(0x007fffff));
uint32x4_t isNanMask = vmvnq_u32(vceqq_f32(vec, vec));
uint32x4_t nantissaBit = vandq_u32(isNanMask, vdupq_n_u32(0x02000000));
uint32x4_t infData = vandq_u32(vmvnq_u32(mantissaData), vdupq_n_u32(0x7c000000));
uint32x4_t infNanData = vorrq_u32(infData, nantissaBit);
uint32x4_t resultWithInfNan = vbslq_u32(isNotInfNanMask, nonNanFloat, infNanData);
// Re-attach signs, then compact the 16-bit results into the low half.
uint32x4_t signedResult = vorrq_u32(signData, resultWithInfNan);
uint16x8x2_t resultZip = vuzpq_u16(vreinterpretq_u16_u32(signedResult), vdupq_n_u16(0));
return vreinterpretq_s32_u16(resultZip.val[1]);
// Pure bit-casts between vector types — no value conversion performed.
#define AKSIMD_CAST_V2F64_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
#define AKSIMD_CAST_V2F64_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
#define AKSIMD_CAST_V4F32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4I32( __vec__ ) (int32x4_t)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V2F64( __vec__ ) (float64x2_t)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4F32( __vec__ ) (float32x4_t)(__vec__)
// Condition-mask (uint32x4_t) <-> float/int bit-casts.
#define AKSIMD_CAST_V4COND_TO_V4F32( __vec__ ) vreinterpretq_f32_u32(__vec__)
#define AKSIMD_CAST_V4F32_TO_V4COND( __vec__ ) vreinterpretq_u32_f32(__vec__)
#define AKSIMD_CAST_V4COND_TO_V4I32( __vec__ ) (AKSIMD_V4COND)(__vec__)
#define AKSIMD_CAST_V4I32_TO_V4COND( __vec__ ) (AKSIMD_V4COND)(__vec__)
// Bitwise AND of 4x32-bit integer vectors.
#define AKSIMD_AND_V4I32( __a__, __b__ ) vandq_s32( (__a__), (__b__) )
// Lane-wise signed 16-bit greater-than; operands/result travel as V4I32,
// hence the reinterprets on both sides.
#define AKSIMD_CMPGT_V8I16( __a__, __b__ ) \
	vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
// Lane-wise float less-or-equal; result is an all-ones/all-zeros mask.
#define AKSIMD_CMPLE_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__) )
// Lane-wise signed-integer compares; the unsigned mask result is
// reinterpreted back to the signed V4I32 convention (all-ones = true).
// Fix: parenthesized every macro parameter (comma/precedence safety),
// matching the convention the rest of this header uses, and pass ~0
// (all bits set, already an int) to vdupq_n_s32 instead of the unsigned ~0u.
#define AKSIMD_CMPLT_V4I32( __a__, __b__ ) vreinterpretq_s32_u32( vcltq_s32( (__a__), (__b__) ) )
#define AKSIMD_CMPGT_V4I32( __a__, __b__ ) vreinterpretq_s32_u32( vcgtq_s32( (__a__), (__b__) ) )
// Bitwise OR / NOT / XOR on 4x32-bit integer vectors (NOT is XOR with all-ones).
#define AKSIMD_OR_V4I32( __a__, __b__ ) vorrq_s32( (__a__), (__b__) )
#define AKSIMD_NOT_V4I32( __a__ ) veorq_s32( (__a__), vdupq_n_s32(~0) )
#define AKSIMD_XOR_V4I32( __a__, __b__ ) veorq_s32( (__a__), (__b__) )
// Body of the float XOR helper (signature outside this view): bit-cast to
// uint, operate, bit-cast back.
uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
uint32x4_t res = veorq_u32(t0, t1);
return vreinterpretq_f32_u32(res);
// Body of the float OR helper — same bit-cast pattern.
uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
uint32x4_t res = vorrq_u32(t0, t1);
return vreinterpretq_f32_u32(res);
// Body of the float AND helper — same bit-cast pattern.
uint32x4_t t0 = vreinterpretq_u32_f32(in_vec0);
uint32x4_t t1 = vreinterpretq_u32_f32(in_vec1);
uint32x4_t res = vandq_u32(t0, t1);
return vreinterpretq_f32_u32(res);
// Body of the float NOT helper: XOR against an all-ones mask.
uint32x4_t allSet = vdupq_n_u32(~0u);
uint32x4_t reinterpret = vreinterpretq_u32_f32(in_vec);
uint32x4_t result = veorq_u32(reinterpret, allSet);
return vreinterpretq_f32_u32(result);
// Bitwise OR / AND on condition masks (uint32x4_t).
#define AKSIMD_OR_V4COND( __a__, __b__ ) vorrq_u32(__a__, __b__)
#define AKSIMD_AND_V4COND( __a__, __b__ ) vandq_u32(__a__, __b__)
// Lane-wise integer subtraction.
#define AKSIMD_SUB_V4I32(__a__, __b__) vsubq_s32(__a__, __b__)
// Shifts by an immediate: logical left, logical right (via unsigned),
// and arithmetic (sign-extending) right.
#define AKSIMD_SHIFTLEFT_V4I32( __vec__, __shiftBy__ ) \
	vshlq_n_s32( (__vec__), (__shiftBy__) )
#define AKSIMD_SHIFTRIGHT_V4I32( __vec__, __shiftBy__ ) \
	vreinterpretq_s32_u32( vshrq_n_u32( vreinterpretq_u32_s32(__vec__), (__shiftBy__) ) )
#define AKSIMD_SHIFTRIGHTARITH_V4I32( __vec__, __shiftBy__ ) \
	vshrq_n_s32( (__vec__), (__shiftBy__) )
// Concatenate two 2-lane vectors into one 4-lane vector.
#define AKSIMD_COMBINE_V2F32( a, b ) vcombine_f32( a, b )
// Pack four 2-bit lane selectors into one SSE-style immediate.
#define AKSIMD_SHUFFLE( fp3, fp2, fp1, fp0 ) \
	(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
// Shuffle: clang decodes the immediate at compile time with
// __builtin_shufflevector; otherwise a template helper (declared elsewhere)
// dispatches on the immediate.
#if defined(__clang__)
#if defined(__has_builtin) && __has_builtin(__builtin_shufflevector)
#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
	__builtin_shufflevector(a, b, (zyxw >> 0) & 0x3, (zyxw >> 2) & 0x3, ((zyxw >> 4) & 0x3) + 4, ((zyxw >> 6) & 0x3) + 4 )
#ifndef AKSIMD_SHUFFLE_V4F32
#define AKSIMD_SHUFFLE_V4F32( a, b, zyxw ) \
	_AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
// Integer shuffle re-uses the float shuffle through bit-casts.
#define AKSIMD_SHUFFLE_V4I32( a, b, zyxw ) vreinterpretq_s32_f32(AKSIMD_SHUFFLE_V4F32( vreinterpretq_f32_s32(a), vreinterpretq_f32_s32(b), zyxw ))
// SSE-style MOVEHL / MOVELH realized as shuffles.
#define AKSIMD_MOVEHL_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__b__, __a__, AKSIMD_SHUFFLE(3,2,3,2))
#define AKSIMD_MOVELH_V4F32( __a__, __b__ ) AKSIMD_SHUFFLE_V4F32(__a__, __b__, AKSIMD_SHUFFLE(1,0,1,0))
// Fixed permutations: BADC swaps within 64-bit pairs (vrev64q), CDAB swaps
// the pairs, BCDA rotates lanes left by one.
#define AKSIMD_SHUFFLE_BADC( __a__ ) vrev64q_f32( __a__ )
#define AKSIMD_SHUFFLE_CDAB( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(1,0,3,2))
#define AKSIMD_SHUFFLE_BCDA( __a__ ) AKSIMD_SHUFFLE_V4F32( (__a__), (__a__), AKSIMD_SHUFFLE(0,3,2,1))
// Duplicate the odd (1,3) or even (0,2) lanes into lane pairs.
#define AKSIMD_DUP_ODD(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
#define AKSIMD_DUP_EVEN(__vv) AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
// Lane-wise float subtraction (128-bit and 64-bit variants).
#define AKSIMD_SUB_V4F32( __a__, __b__ ) vsubq_f32( (__a__), (__b__) )
#define AKSIMD_SUB_V2F32( __a__, __b__ ) vsub_f32( (__a__), (__b__) )
// Subtract lane 0 of __b__ from lane 0 of __a__; lanes 1-3 of __a__ pass
// through unchanged (lanes 1-3 of the built subtrahend are zero).
// Fix: removed the trailing ';' so the macro expands to a pure expression,
// exactly like its AKSIMD_ADD_SS_V4F32 counterpart in this file.
#define AKSIMD_SUB_SS_V4F32( __a__, __b__ ) \
	vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
// Lane-wise addition: 4-float, 2-float and 4-int variants.
#define AKSIMD_ADD_V4F32( __a__, __b__ ) vaddq_f32( (__a__), (__b__) )
#define AKSIMD_ADD_V2F32( __a__, __b__ ) vadd_f32( (__a__), (__b__) )
#define AKSIMD_ADD_V4I32( __a__, __b__ ) vaddq_s32( (__a__), (__b__) )
// Lane-wise 32-bit integer multiply keeping the low 32 bits of each product;
// on NEON vmulq_s32 does exactly that, so both variants map to it.
// Fix: parenthesized the macro parameters for precedence/comma safety,
// consistent with the neighbouring arithmetic macros in this header.
#define AKSIMD_MULLO16_V4I32( __a__, __b__ ) vmulq_s32( (__a__), (__b__) )
#define AKSIMD_MULLO_V4I32( __a__, __b__ ) vmulq_s32( (__a__), (__b__) )
// Lane-wise float equality compares (128-bit and 64-bit).
#define AKSIMD_COMP_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__) )
#define AKSIMD_COMP_V2F32( __a__, __b__ ) vceq_f32( (__a__), (__b__) )
// Add lane 0 of __b__ to lane 0 of __a__; other lanes of __a__ unchanged.
#define AKSIMD_ADD_SS_V4F32( __a__, __b__ ) \
	vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
// Lane-wise / vector-by-scalar multiplies.
#define AKSIMD_MUL_V4F32( __a__, __b__ ) vmulq_f32( (__a__), (__b__) )
#define AKSIMD_MUL_V4F32_SCALAR( __a__, __b__ ) vmulq_n_f32( (__a__), (__b__) )
// Tail of the division helper (signature outside this view): one
// Newton-Raphson refinement of the reciprocal estimate, then a * (1/b).
inv = vmulq_f32(restep, inv);
return vmulq_f32(a, inv);
#define AKSIMD_MUL_V2F32( __a__, __b__ ) vmul_f32( (__a__), (__b__) )
#define AKSIMD_MUL_V2F32_SCALAR( __a__, __b__ ) vmul_n_f32( (__a__), (__b__) )
// Multiply lane 0 of __a__ by lane 0 of __b__; other lanes become 0 * lane.
#define AKSIMD_MUL_SS_V4F32( __a__, __b__ ) \
	vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
// Fused multiply-add a*b + c: true FMA (vfma*) on ARM64, multiply-accumulate
// (vmla*, separately rounded) on ARMv7.
#if defined(AK_CPU_ARM_64)
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vfmaq_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) vfma_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vfmaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vfma_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V4F32( __a__, __b__, __c__ ) vmlaq_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32( __a__, __b__, __c__ ) AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MADD_V4F32_SCALAR( __a__, __b__, __c__ ) vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define AKSIMD_MADD_V2F32_SCALAR( __a__, __b__, __c__ ) vmla_n_f32( (__c__), (__a__), (__b__) )
// Multiply-subtract a*b - c (composed, not fused).
#define AKSIMD_MSUB_V4F32( __a__, __b__, __c__ ) AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
#define AKSIMD_MSUB_V2F32( __a__, __b__, __c__ ) AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
// Lane-wise min / max.
#define AKSIMD_MIN_V4F32( __a__, __b__ ) vminq_f32( (__a__), (__b__) )
#define AKSIMD_MIN_V2F32( __a__, __b__ ) vmin_f32( (__a__), (__b__) )
#define AKSIMD_MAX_V4F32( __a__, __b__ ) vmaxq_f32( (__a__), (__b__) )
#define AKSIMD_MAX_V2F32( __a__, __b__ ) vmax_f32( (__a__), (__b__) )
// Lane-wise absolute value and negation.
#define AKSIMD_ABS_V4F32( __a__ ) vabsq_f32((__a__))
#define AKSIMD_NEG_V2F32( __a__ ) vneg_f32( (__a__) )
#define AKSIMD_NEG_V4F32( __a__ ) vnegq_f32( (__a__) )
// NOTE(review): sqrt(x) approximated as recip-estimate(rsqrt-estimate(x)).
// Both are raw NEON estimates with no refinement step, so precision is low
// (~8 mantissa bits) — presumably acceptable for this audio path; confirm.
#define AKSIMD_SQRT_V4F32( __vec__ ) vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
#define AKSIMD_RSQRT_V4F32( __a__ ) vrsqrteq_f32( (__a__) )
#define AKSIMD_SQRT_V2F32( __vec__ ) vrecpe_f32( vrsqrte_f32( __vec__ ) )
// Raw reciprocal estimate (unrefined).
#define AKSIMD_RECIP_V4F32(__a__) vrecpeq_f32(__a__)
// Fragments of a complex-number multiply helper (enclosing function
// signature outside this view): sign mask alternates negation across the
// real/imaginary lane pairs.
#ifdef AKSIMD_DECLARE_V4F32
static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
// Duplicate real/imag parts of operand 2 and swap pairs of operand 1.
float32x4x2_t vC2Ext = vtrnq_f32(vCIn2, vCIn2);
float32x4_t vC1Rev = vrev64q_f32(vCIn1);
float32x4_t vMul = vmulq_f32(vCIn1, vC2Ext.val[0]);
// Second complex helper fragment — same sign-flip constant, then add.
#ifdef AKSIMD_DECLARE_V4F32
static const AKSIMD_V4F32 vSign = { -0.f, 0.f, -0.f, 0.f };
return vaddq_f32(vIn1, vIn2SignFlip);
// Interleave the low / high 16-bit halves of two vectors (SSE unpacklo/hi).
#define AKSIMD_UNPACKLO_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
#define AKSIMD_UNPACKHI_VECTOR8I16( __a__, __b__ ) vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
// Body of float unpack-low (signature outside this view): interleaves the
// low halves of two vectors into { x, a, y, b }.
float32x2_t xy = vget_low_f32( in_vec1 );
float32x2_t ab = vget_low_f32( in_vec2 );
float32x2x2_t xa_yb = vtrn_f32( xy, ab );
AKSIMD_V4F32 xayb = vcombine_f32( xa_yb.val[0], xa_yb.val[1] );
// Body of float unpack-high — same pattern on the high halves.
float32x2_t zw = vget_high_f32( in_vec1 );
float32x2_t cd = vget_high_f32( in_vec2 );
float32x2x2_t zc_wd = vtrn_f32( zw, cd );
AKSIMD_V4F32 zcwd = vcombine_f32( zc_wd.val[0], zc_wd.val[1] );
// Body of the saturating int32 -> int16 pack helper.
int16x4_t vec1_16 = vqmovn_s32( in_vec1 );
int16x4_t vec2_16 = vqmovn_s32( in_vec2 );
int16x8_t result = vcombine_s16( vec1_16, vec2_16 );
return vreinterpretq_s32_s16( result );
// { hi(in_vec1), lo(in_vec2) } via a rotating extract.
#define AKSIMD_HILO_V2F32( in_vec1, in_vec2 ) vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
// 2x2 / 4x4-pairwise transposes and 2-lane swap.
#define AKSIMD_TRANSPOSE_V2F32( in_vec1, in_vec2 ) vtrn_f32( in_vec1, in_vec2 )
#define AKSIMD_TRANSPOSE_V4F32( in_vec1, in_vec2 ) vtrnq_f32( in_vec1, in_vec2 )
#define AKSIMD_SWAP_V2F32( in_vec ) vrev64_f32( in_vec )
// Gather fragments (signatures/address setup outside this view): pull
// interleaved 16-bit pairs from four addresses, then widen to 32-bit.
ret16 = vld2_lane_s16(addr0, ret16, 0);
ret16 = vld2_lane_s16(addr1, ret16, 1);
ret16 = vld2_lane_s16(addr2, ret16, 2);
ret16 = vld2_lane_s16(addr3, ret16, 3);
vmovl_s16(ret16.val[0]),
vmovl_s16(ret16.val[1])
// Same gather pattern for 4-element (vld4) interleaved data.
ret16 = vld4_lane_s16(addr0, ret16, 0);
ret16 = vld4_lane_s16(addr1, ret16, 1);
ret16 = vld4_lane_s16(addr2, ret16, 2);
ret16 = vld4_lane_s16(addr3, ret16, 3);
vmovl_s16(ret16.val[0]),
vmovl_s16(ret16.val[1]),
vmovl_s16(ret16.val[2]),
vmovl_s16(ret16.val[3])
// Comparison results travel as unsigned lane masks.
#define AKSIMD_CMP_CTRLMASK uint32x4_t
// Lane-wise float ordering compares producing all-ones/all-zeros masks.
#define AKSIMD_GTEQ_V4F32( __a__, __b__ ) vcgeq_f32( (__a__), (__b__))
#define AKSIMD_GT_V4F32( __a__, __b__ ) vcgtq_f32( (__a__), (__b__))
#define AKSIMD_LTEQ_V4F32( __a__, __b__ ) vcleq_f32( (__a__), (__b__))
#define AKSIMD_LT_V4F32( __a__, __b__ ) vcltq_f32( (__a__), (__b__))
#define AKSIMD_GTEQ_V4I32( __a__, __b__ ) vcgeq_s32( (__a__), (__b__))
#define AKSIMD_EQ_V4F32( __a__, __b__ ) vceqq_f32( (__a__), (__b__))
#define AKSIMD_EQ_V4I32( __a__, __b__ ) vceqq_s32( (__a__), (__b__))
// Per-lane select: returns __b__ where mask __c__ is set, else __a__
// (note vbslq_f32 takes the mask first).
#define AKSIMD_VSEL_V4F32( __a__, __b__, __c__ ) vbslq_f32( (__c__), (__b__), (__a__) )
// Select __b__ where cond1 >= cond2, else __a__.
#define AKSIMD_SEL_GTEQ_V4F32( __a__, __b__, __cond1__, __cond2__ ) AKSIMD_VSEL_V4F32( __a__, __b__, vcgeq_f32( __cond1__, __cond2__ ) )
// Select __b__ where __a__ >= 0, else __c__.
#define AKSIMD_SEL_GTEZ_V4F32( __a__, __b__, __c__ ) AKSIMD_VSEL_V4F32( (__c__), (__b__), vcgeq_f32( __a__, AKSIMD_SETZERO_V4F32() ) )
// Broadcast lane idx into all four lanes.
#define AKSIMD_SPLAT_V4F32(var, idx) vmovq_n_f32(vgetq_lane_f32(var, idx))
// Movemask helper fragments (signature outside this view): test each
// lane's sign bit and collapse into a 4-bit integer mask, SSE-style.
#ifdef AKSIMD_DECLARE_V4F32
static const AKSIMD_DECLARE_V4I32(highbit, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000, (int32_t)0x80000000);
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t0 = in_vec1;
uint32x4_t t1 = vtstq_u32(t0, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
// "All lanes zero?" test: horizontal max on ARM64 (when the intrinsic is
// available in MSVC's arm64_neon.h), 64-bit OR-reduce otherwise.
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vmaxvq_u32)) // vmaxvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
uint32_t maxValue = vmaxvq_u32(vreinterpretq_u32_s32(a));
return maxValue == 0;
int64x1_t orReduce = vorr_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
return vget_lane_s64(orReduce, 0) == 0;
#define AKSIMD_TESTZERO_V4F32( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTZERO_V4COND( __a__) AKSIMD_TESTZERO_V4I32(vreinterpretq_s32_u32(__a__))
// "All bits set?" test: horizontal min on ARM64, 64-bit AND-reduce otherwise.
#if defined AK_CPU_ARM_64 && (!defined(_MSC_VER) || defined(vminvq_u32)) // vminvq_u32 is defined only in some versions of MSVC's arm64_neon.h (introduced during Visual Studio 2019)
uint32_t minValue = vminvq_u32(vreinterpretq_u32_s32(a));
return minValue == ~0;
int64x1_t andReduce = vand_s64(vget_low_s64(vreinterpretq_s64_s32(a)), vget_high_s64(vreinterpretq_s64_s32(a)));
return vget_lane_s64(andReduce, 0) == ~0LL;
#define AKSIMD_TESTONES_V4F32( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_f32(__a__))
#define AKSIMD_TESTONES_V4COND( __a__) AKSIMD_TESTONES_V4I32(vreinterpretq_s32_u32(__a__))
#endif //_AKSIMD_ARM_NEON_H_