バージョン

menu_open
Wwise SDK 2022.1.18
AkSimdMath.h
[詳解]
1 /***********************************************************************
2  The content of this file includes source code for the sound engine
3  portion of the AUDIOKINETIC Wwise Technology and constitutes "Level
4  Two Source Code" as defined in the Source Code Addendum attached
5  with this file. Any use of the Level Two Source Code shall be
6  subject to the terms and conditions outlined in the Source Code
7  Addendum and the End User License Agreement for Wwise(R).
8 
9  Copyright (c) 2024 Audiokinetic Inc.
10  ***********************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////
13 //
14 // AkSimdMath.h
15 //
16 // Library of static functions for math computations with SIMD in mind.
17 //
18 //////////////////////////////////////////////////////////////////////
19 #ifndef _AKSIMDMATH_H_
20 #define _AKSIMDMATH_H_
21 
24 #include <AkMath.h>
25 
26 namespace AkMath
27 {
28  //Take 4 vectors <x,y,z> and return <x,x,x,x>, <y,y,y,y> and <z,z,z,z>
29  AkForceInline void PermuteVectors3(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
30  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz)
31  {
32  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
33  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
34  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
35  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
36 
37  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
38  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
39  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
40  }
41 
42  //Take 3 vectors <x3,x2,x1,x0>, <y,y,y,y> and <z,z,z,z> and return 4 vectors <x,y,z,z>
43  AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32& xxxx, const AKSIMD_V4F32& yyyy, const AKSIMD_V4F32& zzzz,
44  AKSIMD_V4F32& out_v0, AKSIMD_V4F32& out_v1, AKSIMD_V4F32& out_v2, AKSIMD_V4F32& out_v3)
45  {
46  /*__m128 _mm_shuffle_ps(__m128 lo, __m128 hi, _MM_SHUFFLE(hi3, hi2, lo1, lo0))
47  Interleave inputs into low 2 floats and high 2 floats of output.Basically
48  out[0] = lo[lo0];
49  out[1] = lo[lo1];
50  out[2] = hi[hi2];
51  out[3] = hi[hi3];
52  */
53 
54  AKSIMD_V4F32 x0x1y0y1 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(1, 0, 1, 0));
55  AKSIMD_V4F32 z0z1z0z1 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(1, 0, 1, 0));
56 
57  out_v0 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(2, 0, 2, 0));
58  out_v1 = AKSIMD_SHUFFLE_V4F32(x0x1y0y1, z0z1z0z1, AKSIMD_SHUFFLE(3, 1, 3, 1));
59 
60  AKSIMD_V4F32 x2x3y2y3 = AKSIMD_SHUFFLE_V4F32(xxxx, yyyy, AKSIMD_SHUFFLE(3, 2, 3, 2));
61  AKSIMD_V4F32 z2z3z2z3 = AKSIMD_SHUFFLE_V4F32(zzzz, zzzz, AKSIMD_SHUFFLE(3, 2, 3, 2));
62 
63  out_v2 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(2, 0, 2, 0));
64  out_v3 = AKSIMD_SHUFFLE_V4F32(x2x3y2y3, z2z3z2z3, AKSIMD_SHUFFLE(3, 1, 3, 1));
65  }
66 
67  //Take 4 vectors <x,y,z,w> and return <x,x,x,x>, <y,y,y,y>, <z,z,z,z> and <w,w,w,w>
68  AkForceInline void PermuteVectors4(const AKSIMD_V4F32& v0, const AKSIMD_V4F32& v1, const AKSIMD_V4F32& v2, const AKSIMD_V4F32& v3,
69  AKSIMD_V4F32& out_xxxx, AKSIMD_V4F32& out_yyyy, AKSIMD_V4F32& out_zzzz, AKSIMD_V4F32& out_wwww)
70  {
71  AKSIMD_V4F32 xyxy0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(1, 0, 1, 0));
72  AKSIMD_V4F32 xyxy1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(1, 0, 1, 0));
73  out_xxxx = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(2, 0, 2, 0));
74  out_yyyy = AKSIMD_SHUFFLE_V4F32(xyxy0, xyxy1, AKSIMD_SHUFFLE(3, 1, 3, 1));
75 
76  AKSIMD_V4F32 zwzw0 = AKSIMD_SHUFFLE_V4F32(v0, v1, AKSIMD_SHUFFLE(3, 2, 3, 2));
77  AKSIMD_V4F32 zwzw1 = AKSIMD_SHUFFLE_V4F32(v2, v3, AKSIMD_SHUFFLE(3, 2, 3, 2));
78  out_zzzz = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(2, 0, 2, 0));
79  out_wwww = AKSIMD_SHUFFLE_V4F32(zwzw0, zwzw1, AKSIMD_SHUFFLE(3, 1, 3, 1));
80  }
81 
82  // 3-element dot product of 4 vectors.
84  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
85  {
86  return AKSIMD_ADD_V4F32(AKSIMD_ADD_V4F32(AKSIMD_MUL_V4F32(v0_x, v1_x), AKSIMD_MUL_V4F32(v0_y, v1_y)), AKSIMD_MUL_V4F32(v0_z, v1_z));
87  }
88 
89  // 3-element dot product of 1 common vector with 4 vectors
90  AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z)
91  {
92  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
93  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
94  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
95  return DotPoduct3_4x4(v0_x, v0_y, v0_z, v1_x, v1_y, v1_z);
96  }
97 
98  // 4-element dot product of 4 vectors.
99  AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32& v0_x, const AKSIMD_V4F32& v0_y, const AKSIMD_V4F32& v0_z, const AKSIMD_V4F32& v0_w,
100  const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
101  {
102  return AKSIMD_ADD_V4F32(
104  AKSIMD_MUL_V4F32(v0_x, v1_x),
105  AKSIMD_MUL_V4F32(v0_y, v1_y)),
107  AKSIMD_MUL_V4F32(v0_z, v1_z),
108  AKSIMD_MUL_V4F32(v0_w, v1_w)));
109  }
110 
111  // 4-element dot product of 1 common vector with 4 vectors
112  AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32& v0_xyz, const AKSIMD_V4F32& v1_x, const AKSIMD_V4F32& v1_y, const AKSIMD_V4F32& v1_z, const AKSIMD_V4F32& v1_w)
113  {
114  const AKSIMD_V4F32 v0_x = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(0, 0, 0, 0));
115  const AKSIMD_V4F32 v0_y = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(1, 1, 1, 1));
116  const AKSIMD_V4F32 v0_z = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
117  const AKSIMD_V4F32 v0_w = AKSIMD_SHUFFLE_V4F32(v0_xyz, v0_xyz, AKSIMD_SHUFFLE(2, 2, 2, 2));
118  return DotPoduct4_4x4(v0_x, v0_y, v0_z, v0_w, v1_x, v1_y, v1_z, v1_w);
119  }
120 
121  // Trig functions approximation (based on the Fast versions found in AkMath.h)
123  {
124  const AKSIMD_V4F32 B = AKSIMD_SET_V4F32(4 / PI);
125  const AKSIMD_V4F32 C = AKSIMD_SET_V4F32(-4 / (PI * PI));
126  const AKSIMD_V4F32 P = AKSIMD_SET_V4F32(0.225f);
127 
128  //float y = B * x + C * x * fabs(x); //float y = X*(B+C*fabs(x));
129 
131  y = AKSIMD_MADD_V4F32(y, C, B);
132  y = AKSIMD_MUL_V4F32(y, x);
133 
134  // return P * (y * fabs(y) - y) + y;
135  AKSIMD_V4F32 sine = AKSIMD_ABS_V4F32(y);
136  sine = AKSIMD_MSUB_V4F32(y, sine, y);
137  sine = AKSIMD_MADD_V4F32(sine, P, y);
138  return sine;
139  }
140 
142  {
143  //Compute the offset needed for the cosinus. If you compare with FastCos, the constants have been combined.
144  const AKSIMD_V4F32 offsetNoWrap = AKSIMD_SET_V4F32(PI / 2); // cos = sin(x+pi/2)
145  const AKSIMD_V4F32 offsetWrap = AKSIMD_SET_V4F32(PI / 2 - 2 * PI); // Wrap: cos(x) = cos(x - 2 pi)
146  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
147 
148  // (cond1 >= cond2) ? a : b
149  AKSIMD_V4F32 offset = AKSIMD_SEL_GTEZ_V4F32(AKSIMD_SUB_V4F32(x, vHalfPI), offsetWrap, offsetNoWrap);
150  return AKSIMD_SIN_V4F32(AKSIMD_ADD_V4F32(x, offset));
151  }
152 
154  {
155  const AKSIMD_V4F32 vNeg = AKSIMD_SET_V4F32(-1.0f);
156  const AKSIMD_V4F32 vOne = AKSIMD_SET_V4F32(1.0f);
157  const AKSIMD_V4F32 vZero = AKSIMD_SET_V4F32(0.0f);
158  const AKSIMD_V4F32 vK = AKSIMD_SET_V4F32(0.28f);
159  const AKSIMD_V4F32 vKRepro = AKSIMD_SET_V4F32(1.f / 0.28f);
160  const AKSIMD_V4F32 vHalfPI = AKSIMD_SET_V4F32(PI / 2);
161  const AKSIMD_V4F32 vPI = AKSIMD_SET_V4F32(PI);
162  const AKSIMD_V4F32 vEpsilon = AKSIMD_SET_V4F32(1e-20f);
163 
164  //Ensure x is not zero a == 0 ? b : c.
165  x = AKSIMD_VSEL_V4F32(x, vEpsilon, AKSIMD_EQ_V4F32(x, vZero));
166 
167  AKSIMD_V4F32 z = AKSIMD_DIV_V4F32(y, x);
168  AKSIMD_V4F32 absz = AKSIMD_ABS_V4F32(z);
169  AKSIMD_V4COND zcond = AKSIMD_GTEQ_V4F32(vOne, absz);
170 
171  //The approximation is done in 2 segments of the form: offset + z/a*(z*z + b);
172 
173  //if ( fabsf( z ) < 1.0f ) then use .28 for the a coef
174  AKSIMD_V4F32 a = AKSIMD_VSEL_V4F32(vNeg, vK, zcond);
175 
176  //if ( fabsf( z ) < 1.0f ) then use 1 for the b factor, else use 0.28
177  AKSIMD_V4F32 b = AKSIMD_VSEL_V4F32(vK, vKRepro, zcond);
178 
179  AKSIMD_V4F32 atan = AKSIMD_MADD_V4F32(z, z, b);
180  atan = AKSIMD_MUL_V4F32(atan, a);
181  atan = AKSIMD_DIV_V4F32(z, atan);
182 
183  //Adjust for quadrant
184  // zcond x<0 y<0 offset
185  // 1 0 0 0
186  // 1 0 1 0
187  // 1 1 0 +PI
188  // 1 1 1 -PI
189  // 0 0 0 +PI/2
190  // 0 0 1 -PI/2
191  // 0 1 0 +PI/2
192  // 0 1 1 -PI/2
193 
194  AKSIMD_V4F32 offsetByX = AKSIMD_SEL_GTEZ_V4F32(x, vZero, vPI);
195  AKSIMD_V4F32 offset = AKSIMD_VSEL_V4F32(vHalfPI, offsetByX, zcond);
196  AKSIMD_V4F32 sign = AKSIMD_SEL_GTEZ_V4F32(y, vOne, vNeg);
197 
198  //Apply computed offset.
199  atan = AKSIMD_MADD_V4F32(offset, sign, atan);
200  return atan;
201  }
202 
203  //Accepts any positive x. Compare with FastSqrt() which accepts only between ]0,1]
205  {
207  return AKSIMD_GETELEMENT_V4F32(y, 0);
208  }
209 
210  //Compute 1/sqrt(x)
212  {
214  return AKSIMD_GETELEMENT_V4F32(y, 0);
215  }
216 
218  {
220  return AKSIMD_GETELEMENT_V4F32(y, 0);
221  }
222 }
223 
224 #endif //_AKSIMDMATH_H_
#define AKSIMD_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: AkSimd.h:546
AkForceInline AKSIMD_V4F32 DotPoduct3_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:90
AkForceInline AKSIMD_V4F32 DotPoduct4_1x4(const AKSIMD_V4F32 &v0_xyz, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:112
float32x4_t AKSIMD_V4F32
Vector of 4 32-bit floats
Definition: AkSimd.h:72
AkForceInline AKSIMD_V4F32 DotPoduct4_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v0_w, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z, const AKSIMD_V4F32 &v1_w)
Definition: AkSimdMath.h:99
#define AKSIMD_VSEL_V4F32(__a__, __b__, __c__)
Return a when control mask is 0, return b when control mask is non zero, control mask is in c and usu...
Definition: AkSimd.h:947
#define AKSIMD_GTEQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:926
#define AKSIMD_RSQRT_V4F32(__a__)
Vector reciprocal square root approximation 1/sqrt(a), or equivalently, sqrt(1/a)
Definition: AkSimd.h:738
AkForceInline AKSIMD_V4F32 AKSIMD_COS_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:141
uint32x4_t AKSIMD_V4COND
Vector of 4 comparison results
Definition: AkSimd.h:74
AkForceInline void UnpermuteVectors3(const AKSIMD_V4F32 &xxxx, const AKSIMD_V4F32 &yyyy, const AKSIMD_V4F32 &zzzz, AKSIMD_V4F32 &out_v0, AKSIMD_V4F32 &out_v1, AKSIMD_V4F32 &out_v2, AKSIMD_V4F32 &out_v3)
Definition: AkSimdMath.h:43
#define AKSIMD_MADD_V4F32(__a__, __b__, __c__)
Vector multiply-add and multiply-subtract operations (Aarch64 uses the fused-variants directly where ...
Definition: AkSimd.h:694
#define AKSIMD_SHUFFLE_V4F32(a, b, zyxw)
Definition: AkSimd.h:563
AkForceInline AKSIMD_V4F32 DotPoduct3_4x4(const AKSIMD_V4F32 &v0_x, const AKSIMD_V4F32 &v0_y, const AKSIMD_V4F32 &v0_z, const AKSIMD_V4F32 &v1_x, const AKSIMD_V4F32 &v1_y, const AKSIMD_V4F32 &v1_z)
Definition: AkSimdMath.h:83
float AkReal32
32-bit floating point
#define AKSIMD_EQ_V4F32(__a__, __b__)
Compare each float element and return control mask.
Definition: AkSimd.h:941
AkForceInline AkReal32 FastRSqrt(AkReal32 x)
Definition: AkSimdMath.h:211
#define AKSIMD_GETELEMENT_V4F32(__vName, __num__)
Get the element at index num in vector __vName
Definition: AkSimd.h:38
#define AKSIMD_SUB_V4F32(__a__, __b__)
Definition: AkSimd.h:612
AkForceInline AKSIMD_V4F32 AKSIMD_ATAN2_V4F32(AKSIMD_V4F32 y, AKSIMD_V4F32 x)
Definition: AkSimdMath.h:153
#define AKSIMD_MSUB_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:702
#define AKSIMD_SEL_GTEZ_V4F32(__a__, __b__, __c__)
Definition: AkSimd.h:953
#define AKSIMD_SET_V4F32(__scalar__)
Definition: AkSimd.h:108
AkForceInline void PermuteVectors4(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz, AKSIMD_V4F32 &out_wwww)
Definition: AkSimdMath.h:68
AkForceInline void PermuteVectors3(const AKSIMD_V4F32 &v0, const AKSIMD_V4F32 &v1, const AKSIMD_V4F32 &v2, const AKSIMD_V4F32 &v3, AKSIMD_V4F32 &out_xxxx, AKSIMD_V4F32 &out_yyyy, AKSIMD_V4F32 &out_zzzz)
Definition: AkSimdMath.h:29
AkForceInline AKSIMD_V4F32 AKSIMD_DIV_V4F32(AKSIMD_V4F32 a, AKSIMD_V4F32 b)
Rough estimation of division
Definition: AkSimd.h:664
AkForceInline AkReal32 FastSqrtLarge(AkReal32 x)
Definition: AkSimdMath.h:204
#define AKSIMD_RECIP_V4F32(__a__)
Reciprocal of x (1/x)
Definition: AkSimd.h:744
#define AKSIMD_ABS_V4F32(__a__)
Returns absolute value
Definition: AkSimd.h:728
#define AKSIMD_ADD_V4F32(__a__, __b__)
Definition: AkSimd.h:626
AkForceInline AKSIMD_V4F32 AKSIMD_SIN_V4F32(const AKSIMD_V4F32 x)
Definition: AkSimdMath.h:122
AkForceInline AkReal32 FastRcp(AkReal32 x)
Definition: AkSimdMath.h:217
#define AkForceInline
Definition: AkTypes.h:63
#define AKSIMD_SQRT_V4F32(__vec__)
Square root (4 floats)
Definition: AkSimd.h:735
#define AKSIMD_MUL_V4F32(__a__, __b__)
Definition: AkSimd.h:657

このページはお役に立ちましたか?

サポートは必要ですか?

ご質問や問題、ご不明点はございますか?お気軽にお問い合わせください。

サポートページをご確認ください

あなたのプロジェクトについて教えてください。ご不明な点はありませんか。

プロジェクトを登録していただくことで、ご利用開始のサポートをいたします。

Wwiseからはじめよう