バージョン

menu_open
Wwise SDK 2024.1.1
AkSimdMath.h
[詳解]
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Copyright (c) 2024 Audiokinetic Inc.
10  ***********************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////
13 //
14 // AkSimdMath.h
15 //
16 // Library of static functions for math computations with SIMD in mind.
17 //
18 //////////////////////////////////////////////////////////////////////
19 #ifndef _AKSIMDMATH_H_
20 #define _AKSIMDMATH_H_
21 
24 #include <AkMath.h>
25 
26 namespace AkMath
27 {
28  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
29  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
30  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
31  {
32  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
33  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
35  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
36 
37  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
38  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
40  }
41 
42  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
43  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
44  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
45  {
46  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
47  Interleave inputs into low 2 floats and high 2 floats of output.Basically
48  out[0] = lo[lo0];
49  out[1] = lo[lo1];
50  out[2] = hi[hi2];
51  out[3] = hi[hi3];
52  */
53 
54  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
55  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
56 
57  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
58  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
59 
60  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
61  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
62 
63  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
64  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
65  }
66 
67  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
68  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
69  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
70  {
71  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
72  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
74  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
75 
76  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
77  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
79  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
80  }
81 
82  // 3-element dot product of 4 vectors.
84  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
85  {
86  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
87  }
88 
89  // 3-element dot product of 1 common vector with 4 vectors
90  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
91  {
92  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
93  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
94  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
95  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
96  }
97 
98  // 4-element dot product of 4 vectors.
99  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
100  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
101  {
102  return AKSIMD_ADD_V4F32(
104  AKSIMD_MUL_V4F32(v0_x, v1_x),
105  AKSIMD_MUL_V4F32(v0_y, v1_y)),
107  AKSIMD_MUL_V4F32(v0_z, v1_z),
108  AKSIMD_MUL_V4F32(v0_w, v1_w)));
109  }
110 
111  // 4-element dot product of 1 common vector with 4 vectors
112  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
113  {
114  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
115  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
116  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
117  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
119  }
120 
121  // 3-element cross product of 4 vectors, returned as XXXX, YYYY, ZZZZ
123  const AKSIMD_V4F32& u_x, const AKSIMD_V4F32& u_y, const AKSIMD_V4F32& u_z,
124  const AKSIMD_V4F32& v_x, const AKSIMD_V4F32& v_y, const AKSIMD_V4F32& v_z,
125  AKSIMD_V4F32& uXv_x, AKSIMD_V4F32& uXv_y, AKSIMD_V4F32& uXv_z
126  )
127  {
128  uXv_x = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_y, v_z), AKSIMD_MUL_V4F32(u_z, v_y));
129  uXv_y = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_z, v_x), AKSIMD_MUL_V4F32(u_x, v_z));
130  uXv_z = AKSIMD_SUB_V4F32(AKSIMD_MUL_V4F32(u_x, v_y), AKSIMD_MUL_V4F32(u_y, v_x));
131  }
132 
133  // Trig functions approximation (based on the Fast versions found in AkMath.h)
135  {
136  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
137  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
138  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
139 
140  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
141 
143  y = AKSIMD_MADD_V4F32(y, C, B);
144  y = AKSIMD_MUL_V4F32(y, x);
145 
146  // return P * (y * fabs(y) - y) + y;
147  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
148  sine = AKSIMD_MSUB_V4F32(y, sine, y);
149  sine = AKSIMD_MADD_V4F32(sine, P, y);
150  return sine;
151  }
152 
154  {
155  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
156  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
157  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
158  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
159 
160  // (cond1 >= cond2) ? a : b
161  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
162  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
163  }
164 
166  {
167  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
168  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
169  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
170  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
171  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
172  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
173  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
174  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
175 
176  //Ensure x is not zero a == 0 ? b : c.
177  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
178 
179  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
180  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
181  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
182 
183  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
184 
185  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
186  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
187 
188  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
189  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
190 
191  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
192  atan = AKSIMD_MUL_V4F32(atan, a);
193  atan = AKSIMD_DIV_V4F32(z, atan);
194 
195  //Adjust for quadrant
196  // zcond x<0 y<0 offset
197  // 1 0 0 0
198  // 1 0 1 0
199  // 1 1 0 +PI
200  // 1 1 1 -PI
201  // 0 0 0 +PI/2
202  // 0 0 1 -PI/2
203  // 0 1 0 +PI/2
204  // 0 1 1 -PI/2
205 
206  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
207  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
208  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
209 
210  //Apply computed offset.
211  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
212  return atan;
213  }
214 
215  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
217  {
219  return AKSIMD_GETELEMENT_V4F32(y, 0);
220  }
221 
222  //Compute 1/sqrt(x)
224  {
226  return AKSIMD_GETELEMENT_V4F32(y, 0);
227  }
228 
230  {
232  return AKSIMD_GETELEMENT_V4F32(y, 0);
233  }
234 }
235 
236 #endif //_AKSIMDMATH_H_
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:511
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:90
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:112
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:99
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:904
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:883
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:703
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:153
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats
Definition: AkSimdTypes.h:62
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:43
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:659
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:528
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:83
float AkReal32
32-bit floating point
AkForceInline void CrossProduct3_4x4(const AKSIMD_V4F32 &u_x, const AKSIMD_V4F32 &u_y, const AKSIMD_V4F32 &u_z, const AKSIMD_V4F32 &v_x, const AKSIMD_V4F32 &v_y, const AKSIMD_V4F32 &v_z, AKSIMD_V4F32 &uXv_x, AKSIMD_V4F32 &uXv_y, AKSIMD_V4F32 &uXv_z)
Definition: AkSimdMath.h:122
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:898
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:223
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Get the element at index num in vector __vName
Definition: AkSimd.h:37
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:577
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:165
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:667
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:910
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:70
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:68
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:29
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division
Definition: AkSimd.h:629
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:216
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:709
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value
Definition: AkSimd.h:693
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:591
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results
Definition: AkSimdTypes.h:64
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:134
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:229
#define AkForceInline
Definition: AkTypes.h:63
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:700
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:622

このページはお役に立ちましたか?

サポートは必要ですか?

ご質問や問題、ご不明点はございますか?お気軽にお問い合わせください。

サポートページをご確認ください

あなたのプロジェクトについて教えてください。ご不明な点はありませんか。

プロジェクトを登録していただくことで、ご利用開始のサポートをいたします。

Wwiseからはじめよう