
Target Platform(s):

include/AK/SoundEngine/Platforms/arm_neon/AkSimd.h File Reference

Go to the source code of this file.


Platform specific memory size alignment for allocation purposes

#define  AKSIMD_ALIGNSIZE(__Size__)   (((__Size__) + 15) & ~15)
AKSIMD loading / setting

#define  AKSIMD_LOAD_V4F32(__addr__)   vld1q_f32( (float32_t*)(__addr__) )
  Loads four single-precision, floating-point values (see _mm_load_ps).
#define  AKSIMD_LOADU_V4F32(__addr__)   vld1q_f32( (float32_t*)(__addr__) )
#define  AKSIMD_LOAD1_V4F32(__scalar__)   vld1q_dup_f32( (float32_t*)(&(__scalar__)) )
#define  AKSIMD_SET_V4F32(__scalar__)   vdupq_n_f32( __scalar__ )
#define  AKSIMD_SET_V4I32(__scalar__)   vdupq_n_s32( __scalar__ )
  Sets the four integer values to __scalar__.
#define  AKSIMD_SETZERO_V4F32()   AKSIMD_SET_V4F32( 0 )
#define  AKSIMD_LOAD_SS_V4F32(__addr__)   vld1q_lane_f32( (float32_t*)(__addr__), AKSIMD_SETZERO_V4F32(), 0 );
#define  AKSIMD_LOAD_V4I32(__addr__)   vld1q_s32( (const int32_t*)(__addr__) )
  Loads four 32-bit signed integer values (aligned).
#define  AKSIMD_LOAD_V8I16(__addr__)   vld1q_s16( (const int16_t*)(__addr__) )
  Loads 8 16-bit signed integer values (aligned).
#define  AKSIMD_LOAD_V4I16(__addr__)   vld1_s16( (const int16_t*)(__addr__) )
  Loads 4 16-bit signed integer values (aligned).
#define  AKSIMD_LOADU_V4I32(__addr__)   *__addr__
  Loads unaligned 128-bit value (see _mm_loadu_si128).
#define  AKSIMD_SETZERO_V4I32()   vdupq_n_s32( 0 )
  Sets the four 32-bit integer values to zero (see _mm_setzero_si128).
#define  AKSIMD_LOAD_V2F32(__addr__)   vld1_f32( (float32_t*)(__addr__) )
  Loads two single-precision, floating-point values.
#define  AKSIMD_LOAD_V2F32_LANE(__addr__, __vec__, __lane__)   vld1_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
#define  AKSIMD_SET_V2F32(__scalar__)   vdup_n_f32( __scalar__ )
  Sets the two single-precision, floating-point values to __scalar__.
#define  AKSIMD_SETZERO_V2F32()   AKSIMD_SET_V2F32( 0 )
  Sets the two single-precision, floating-point values to zero.
#define  AKSIMD_LOAD_V4F32X2(__addr__)   vld2q_f32( (float32_t*)(__addr__) )
  Loads data from memory and de-interleaves.
#define  AKSIMD_LOAD_V2F32X2(__addr__)   vld2_f32( (float32_t*)(__addr__) )
#define  AKSIMD_LOAD_V2F32X2_LANE(__addr__, __vec__, __lane__)   vld2_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
  Loads data from memory and de-interleaves; only selected lane.
#define  AKSIMD_LOAD_V4F32X4_LANE(__addr__, __vec__, __lane__)   vld4q_lane_f32( (float32_t*)(__addr__), (__vec__), (__lane__) );
AKSIMD storing

#define  AKSIMD_STORE_V4F32(__addr__, __vName__)   vst1q_f32( (float32_t*)(__addr__), (__vName__) )
  Stores four single-precision, floating-point values. The address must be 16-byte aligned.
#define  AKSIMD_STOREU_V4F32(__addr__, __vec__)   vst1q_f32( (float32_t*)(__addr__), (__vec__) )
  Stores four single-precision, floating-point values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STORE1_V4F32(__addr__, __vec__)   vst1q_lane_f32( (float32_t*)(__addr__), (__vec__), 0 )
#define  AKSIMD_STORE_V4I32(__addr__, __vec__)   vst1q_s32( (int32_t*)(__addr__), (__vec__) )
  Stores four 32-bit integer values. The address must be 16-byte aligned.
#define  AKSIMD_STOREU_V4I32(__addr__, __vec__)   vst1q_s32( (int32_t*)(__addr__), (__vec__) )
  Stores four 32-bit integer values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STOREU_V4UI32(__addr__, __vec__)   vst1q_u32( (uint32_t*)(__addr__), (__vec__) )
  Stores four 32-bit unsigned integer values. The address does not need to be 16-byte aligned.
#define  AKSIMD_STORE_V2F32(__addr__, __vName__)   vst1_f32( (AkReal32*)(__addr__), (__vName__) )
  Stores two single-precision, floating-point values. The address must be 16-byte aligned.
#define  AKSIMD_STORE_V4F32X2(__addr__, __vName__)   vst2q_f32( (float32_t*)(__addr__), (__vName__) )
  Stores data by interleaving into memory.
#define  AKSIMD_STORE_V2F32X2(__addr__, __vName__)   vst2_f32( (float32_t*)(__addr__), (__vName__) )
AKSIMD conversion

#define  AKSIMD_CONVERT_V4I32_TO_V4F32(__vec__)   vcvtq_f32_s32( __vec__ )
#define  AKSIMD_CONVERT_V4F32_TO_V4I32(__vec__)   vcvtq_s32_f32( __vec__ )
#define  AKSIMD_TRUNCATE_V4F32_TO_V4I32(__vec__)   vcvtq_s32_f32( (__vec__) )
#define  AKSIMD_CONVERT_V2F32_TO_V2I32(__vec__)   vcvt_s32_f32( __vec__ )
AKSIMD logical operations

#define  AKSIMD_AND_V4I32(__a__, __b__)   vandq_s32( (__a__), (__b__) )
#define  AKSIMD_CMPGT_V8I16(__a__, __b__)   vreinterpretq_s32_u16( vcgtq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ) )
#define  AKSIMD_CMPLE_V4F32(__a__, __b__)   vcleq_f32( (__a__), (__b__) )
  Compares for less than or equal (see _mm_cmple_ps).
AKSIMD shifting

#define  AKSIMD_SHIFTLEFT_V4I32(__vec__, __shiftBy__)   vshlq_n_s32( (__vec__), (__shiftBy__) )
#define  AKSIMD_SHIFTRIGHTARITH_V4I32(__vec__, __shiftBy__)   vrshrq_n_s32( (__vec__), (__shiftBy__) )
AKSIMD vector comparison

Apart from AKSIMD_SEL_GTEQ_V4F32, these implementations are limited to a few platforms.

#define  AKSIMD_CMP_CTRLMASK   uint32x4_t
#define  AKSIMD_GTEQ_V4F32(__a__, __b__)   vcgeq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_GTEQ_V4I32(__a__, __b__)   vcgeq_s32( (__a__), (__b__))
  Compare each integer element and return control mask.
#define  AKSIMD_EQ_V4F32(__a__, __b__)   vceqq_f32( (__a__), (__b__))
  Compare each float element and return control mask.
#define  AKSIMD_EQ_V4I32(__a__, __b__)   vceqq_s32( (__a__), (__b__))
  Compare each integer element and return control mask.
#define  AKSIMD_VSEL_V4F32(__a__, __b__, __c__)   vbslq_f32( (__c__), (__b__), (__a__) )
  Returns a where the control mask is 0 and b where the control mask is non-zero. The control mask is passed in c and is usually produced by the comparison operations above.
#define  AKSIMD_SEL_GTEQ_V4F32(__a__, __b__, __cond1__, __cond2__)   AKSIMD_VSEL_V4F32( __a__, __b__, AKSIMD_GTEQ_V4F32( __cond1__, __cond2__ ) )
#define  AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)   AKSIMD_VSEL_V4F32( (__c__), (__b__), AKSIMD_GTEQ_V4F32( __a__, AKSIMD_SETZERO_V4F32() ) )
#define  AKSIMD_SPLAT_V4F32(var, idx)   vmovq_n_f32(vgetq_lane_f32(var, idx))


AKSIMD types

typedef int32x4_t  AKSIMD_V4I32
  Vector of 4 32-bit signed integers.
typedef int16x8_t  AKSIMD_V8I16
  Vector of 8 16-bit signed integers.
typedef int16x4_t  AKSIMD_V4I16
  Vector of 4 16-bit signed integers.
typedef uint32x4_t  AKSIMD_V4UI32
  Vector of 4 32-bit unsigned integers.
typedef uint32x2_t  AKSIMD_V2UI32
  Vector of 2 32-bit unsigned integers.
typedef int32x2_t  AKSIMD_V2I32
  Vector of 2 32-bit signed integers.
typedef float32_t  AKSIMD_F32
  32-bit float
typedef float32x2_t  AKSIMD_V2F32
  Vector of 2 32-bit floats.
typedef float32x4_t  AKSIMD_V4F32
  Vector of 4 32-bit floats.
typedef uint32x4_t  AKSIMD_V4COND
  Vector of 4 comparison results.
typedef uint32x4_t  AKSIMD_V4ICOND
  Vector of 4 comparison results.
typedef uint32x4_t  AKSIMD_V4FCOND
  Vector of 4 comparison results.
typedef float32x2x2_t  AKSIMD_V2F32X2
typedef float32x4x2_t  AKSIMD_V4F32X2
typedef float32x4x4_t  AKSIMD_V4F32X4

AKSIMD shuffling

#define  AKSIMD_COMBINE_V2F32(a, b)   vcombine_f32( a, b )
#define  AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
#define  AKSIMD_SHUFFLE_V4F32(a, b, zyxw)   _AKSIMD_LOCAL::SHUFFLE_V4F32< zyxw >( a, b )
#define  AKSIMD_SHUFFLE_BADC(__a__)   vrev64q_f32( __a__ )
  Swap the 2 lower floats together and the 2 higher floats together.
#define  AKSIMD_SHUFFLE_CDAB(__a__)   vcombine_f32( vget_high_f32(__a__), vget_low_f32(__a__) )
  Swap the 2 lower floats with the 2 higher floats.
#define  AKSIMD_DUP_ODD(__vv)   AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(3,3,1,1))
  Duplicates the odd items into the even items (d c b a -> d d b b).
#define  AKSIMD_DUP_EVEN(__vv)   AKSIMD_SHUFFLE_V4F32(__vv, __vv, AKSIMD_SHUFFLE(2,2,0,0))
  Duplicates the even items into the odd items (d c b a -> c c a a).
AKSIMD_V4F32  AKSIMD_MOVEHL_V4F32 (const AKSIMD_V4F32 abcd, const AKSIMD_V4F32 xyzw)
AKSIMD_V4F32  AKSIMD_MOVELH_V4F32 (const AKSIMD_V4F32 &xyzw, const AKSIMD_V4F32 &abcd)

AKSIMD arithmetic

#define  AKSIMD_SUB_V4F32(__a__, __b__)   vsubq_f32( (__a__), (__b__) )
#define  AKSIMD_SUB_V2F32(__a__, __b__)   vsub_f32( (__a__), (__b__) )
#define  AKSIMD_SUB_SS_V4F32(__a__, __b__)   vsubq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) );
#define  AKSIMD_ADD_V4F32(__a__, __b__)   vaddq_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_V2F32(__a__, __b__)   vadd_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_V4I32(__a__, __b__)   vaddq_s32( (__a__), (__b__) )
  Adds the four integers of a and b.
#define  AKSIMD_COMP_V4F32(__a__, __b__)   vceqq_f32( (__a__), (__b__) )
#define  AKSIMD_COMP_V2F32(__a__, __b__)   vceq_f32( (__a__), (__b__) )
#define  AKSIMD_ADD_SS_V4F32(__a__, __b__)   vaddq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define  AKSIMD_MUL_V4F32(__a__, __b__)   vmulq_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V4F32_SCALAR(__a__, __b__)   vmulq_n_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V2F32(__a__, __b__)   vmul_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_V2F32_SCALAR(__a__, __b__)   vmul_n_f32( (__a__), (__b__) )
#define  AKSIMD_MUL_SS_V4F32(__a__, __b__)   vmulq_f32( (__a__), vsetq_lane_f32( AKSIMD_GETELEMENT_V4F32( (__b__), 0 ), AKSIMD_SETZERO_V4F32(), 0 ) )
#define  AKSIMD_MADD_V4F32(__a__, __b__, __c__)   AKSIMD_ADD_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
  Vector multiply-add operation.
#define  AKSIMD_MSUB_V4F32(__a__, __b__, __c__)   AKSIMD_SUB_V4F32( AKSIMD_MUL_V4F32( (__a__), (__b__) ), (__c__) )
#define  AKSIMD_MADD_V2F32(__a__, __b__, __c__)   AKSIMD_ADD_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define  AKSIMD_MSUB_V2F32(__a__, __b__, __c__)   AKSIMD_SUB_V2F32( AKSIMD_MUL_V2F32( (__a__), (__b__) ), (__c__) )
#define  AKSIMD_MADD_V4F32_INST(__a__, __b__, __c__)   vmlaq_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MADD_V2F32_INST(__a__, __b__, __c__)   vmla_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MADD_V4F32_SCALAR(__a__, __b__, __c__)   vmlaq_n_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MADD_V2F32_SCALAR(__a__, __b__, __c__)   vmla_n_f32( (__c__), (__a__), (__b__) )
#define  AKSIMD_MIN_V4F32(__a__, __b__)   vminq_f32( (__a__), (__b__) )
#define  AKSIMD_MIN_V2F32(__a__, __b__)   vmin_f32( (__a__), (__b__) )
#define  AKSIMD_MAX_V4F32(__a__, __b__)   vmaxq_f32( (__a__), (__b__) )
#define  AKSIMD_MAX_V2F32(__a__, __b__)   vmax_f32( (__a__), (__b__) )
#define  AKSIMD_ABS_V4F32(__a__)   vabsq_f32((__a__))
  Returns absolute value.
#define  AKSIMD_NEG_V2F32(__a__)   vneg_f32( (__a__) )
  Changes the sign.
#define  AKSIMD_NEG_V4F32(__a__)   vnegq_f32( (__a__) )
#define  AKSIMD_SQRT_V4F32(__vec__)   vrecpeq_f32( vrsqrteq_f32( __vec__ ) )
  Square root (4 floats).
#define  AKSIMD_SQRT_V2F32(__vec__)   vrecpe_f32( vrsqrte_f32( __vec__ ) )
  Square root (2 floats).
AkForceInline AKSIMD_V4F32  AKSIMD_DIV_V4F32 (AKSIMD_V4F32 a, AKSIMD_V4F32 b)
  Rough estimation of division.
AkForceInline AKSIMD_V4F32  AKSIMD_MADD_SS_V4F32 (const AKSIMD_V4F32 &__a__, const AKSIMD_V4F32 &__b__, const AKSIMD_V4F32 &__c__)
  Vector multiply-add operation.
static AkForceInline void  AKSIMD_HORIZONTALADD (AKSIMD_V4F32 &vVec)
  Cross-platform SIMD multiplication of 2 complex data elements with interleaved real and imaginary parts.

AKSIMD packing / unpacking

#define  AKSIMD_UNPACKLO_VECTOR8I16(__a__, __b__)   vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[0] )
#define  AKSIMD_UNPACKHI_VECTOR8I16(__a__, __b__)   vreinterpretq_s32_s16( vzipq_s16( vreinterpretq_s16_s32(__a__), vreinterpretq_s16_s32(__b__) ).val[1] )
#define  AKSIMD_HILO_V2F32(in_vec1, in_vec2)   vreinterpret_f32_u32( vext_u32( vreinterpret_u32_f32( in_vec1 ), vreinterpret_u32_f32( in_vec2 ), 1 ) )
#define  AKSIMD_TRANSPOSE_V2F32(in_vec1, in_vec2)   vtrn_f32( in_vec1, in_vec2 )
#define  AKSIMD_TRANSPOSE_V4F32(in_vec1, in_vec2)   vtrnq_f32( in_vec1, in_vec2 )
#define  AKSIMD_SWAP_V2F32(in_vec)   vrev64_f32( in_vec )
  V1 = {a,b} => VR = {b,a}.
AkForceInline AKSIMD_V4F32  AKSIMD_UNPACKLO_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4F32  AKSIMD_UNPACKHI_V4F32 (const AKSIMD_V4F32 &in_vec1, const AKSIMD_V4F32 &in_vec2)
AkForceInline AKSIMD_V4I32  AKSIMD_PACKS_V4I32 (const AKSIMD_V4I32 &in_vec1, const AKSIMD_V4I32 &in_vec2)

Detailed Description

AKSIMD - arm_neon implementation

Definition in file AkSimd.h.

Was this page helpful?

Need Support?

Questions? Problems? Need more info? Contact us, and we can help!

Visit our Support page

Tell us about your project. We're here to help.

Register your project and we'll help you get started with no strings attached!

Get started with Wwise