nlib
SimdFloat.h
Go to the documentation of this file.
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
5 
6 #ifdef NN_PLATFORM_CTR
7 # ifndef __USE_C99_MATH
8 # define __USE_C99_MATH
9 # endif
10 #endif
11 #include <math.h>
12 #include <float.h>
13 
14 #include "nn/nlib/Config.h"
15 #include "nn/nlib/simd/SimdInt.h"
16 
// Select the 128-bit float vector representation for the target platform.
// When no SIMD backend is configured, fall back to a scalar emulation struct.
#if !defined(NLIB_SIMD) && !defined(NLIB_CAFE_PPC)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
// Scalar emulation: four lanes viewable as float or as raw uint32_t bits
// (the uint32_t view is used for mask and bitwise operations).
typedef struct {
  union {
    float v[4];
    uint32_t u[4];
  } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_SSE41)
// x86 SSE4.1: native 128-bit float vector.
typedef __m128 nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_NEON)
// ARM NEON: native 128-bit float vector and its native paired type.
typedef float32x4_t nlib_f128_t;
typedef float32x4x2_t nlib_f128x2_t;
#elif defined(NLIB_CAFE_PPC)
// Cafe PPC: two paired-single (f32x2) registers hold the four lanes;
// float/uint32_t views are provided for per-lane and bitwise access.
typedef struct {
  union {
    f32x2 ps[2];
    float v[4];
    uint32_t u[4];
  } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#endif
45 
46 NLIB_NAMESPACE_BEGIN
47 namespace simd {
48 
49 // use each_float for the argument value
50 struct each_float_tag {};
52 
53 // __m128(SSE), float32x4_t(NEON)
54 typedef nlib_f128_t f128;
55 // float32x4x2_t(NEON)
57 
// f128 arguments are passed by const reference where passing SIMD vector
// types by value is unsupported or inefficient (pre-2013 MSVC, or the
// scalar fallback struct).
#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg;
#else
typedef const f128 f128arg;
#endif

// NOTE(review): f128arg_ex is by-reference on all MSVC versions —
// presumably for argument positions beyond what the calling convention
// passes in registers; confirm against the functions that use it.
#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif

// __vectorcall exists only on MSVC 2013+; define it away elsewhere so the
// declarations below stay portable.
#if !defined(_MSC_VER) || _MSC_VER < 1800
#ifndef __vectorcall
#define __vectorcall
#endif
#endif
75 
77  public:
78  static f128 __vectorcall SetValue(float v, each_float_tag) NLIB_NOEXCEPT;
79  static f128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
80  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
81  template <size_t N>
82  static f128 __vectorcall SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT;
83  static f128 __vectorcall SetZero() NLIB_NOEXCEPT;
84  template <size_t N>
85  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
86  static f128 __vectorcall SetOne() NLIB_NOEXCEPT;
87  static f128 __vectorcall SetNegativeOne() NLIB_NOEXCEPT;
88  static f128 __vectorcall SetEpsilon() NLIB_NOEXCEPT;
89  static f128 __vectorcall SetInfinity() NLIB_NOEXCEPT;
90  static f128 __vectorcall SetNaN() NLIB_NOEXCEPT;
91  static f128 __vectorcall SetSignMask() NLIB_NOEXCEPT;
92 
93  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
94  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
95  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
96  static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
97  static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
98  static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
99  static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
100  static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
101  static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;
102 
103  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
104  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
105  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
106  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
107  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
108  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
109  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
110  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
111  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
112 
113  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
114  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
115  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
116  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
117  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
118  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
119 
120  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
121  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
122  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
123  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
124  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
125  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
126 
127 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(NLIB_CAFE_PPC)
128  static f128 __vectorcall ConvertFromI128(i128 value) NLIB_NOEXCEPT;
129  static i128 __vectorcall ConvertToI128Round(f128 value) NLIB_NOEXCEPT;
130  static i128 __vectorcall ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT;
131 
132  static f128 __vectorcall CastFromI128(i128 value) NLIB_NOEXCEPT;
133  static i128 __vectorcall CastToI128(f128 value) NLIB_NOEXCEPT;
134 #endif
135 
136  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
137  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
138  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
139  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
140  template <size_t N>
141  static f128 __vectorcall Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT;
142  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
143  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
144  template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
145  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
146  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
147  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
148  template <size_t N>
149  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
151  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
152  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
153  template <size_t N>
154  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
156  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
157  static f128 __vectorcall Abs(f128arg value) NLIB_NOEXCEPT;
158  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
159 
160  //
161  // Min/Max
162  //
163 
164  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
165  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
166  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
167  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
168  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
169  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
170 
171  //
172  // Reciprocal/Sqrt
173  //
174 
175  static f128 __vectorcall Recp(f128arg value) NLIB_NOEXCEPT;
176  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
177  static f128 __vectorcall Sqrt(f128arg value) NLIB_NOEXCEPT;
178  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
179  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
180  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
181 
182  //
183  // Round/Truncate
184  //
185 
186  static f128 __vectorcall Round(f128arg value) NLIB_NOEXCEPT;
187  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
188  static f128 __vectorcall Floor(f128arg value) NLIB_NOEXCEPT;
189  static f128 __vectorcall Ceil(f128arg value) NLIB_NOEXCEPT;
190 
191  //
192  // Logical Operations
193  //
194 
195  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
196  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
197  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
198  static f128 __vectorcall Not(f128arg a) NLIB_NOEXCEPT;
199  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
200  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
201 
202  //
203  // Comparison Operations
204  //
205 
206  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
207  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
208  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
209  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
210  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
211  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
212  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
213  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
214 
215  //
216  // Trigonometric function
217  //
218  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
219  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
220  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
221  static f128 __vectorcall Sin(f128arg value) NLIB_NOEXCEPT;
222  static f128 __vectorcall Cos(f128arg value) NLIB_NOEXCEPT;
223  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
224  static f128 __vectorcall Tan(f128arg value) NLIB_NOEXCEPT;
225  static f128 __vectorcall SinH(f128arg value) NLIB_NOEXCEPT;
226  static f128 __vectorcall CosH(f128arg value) NLIB_NOEXCEPT;
227  static f128 __vectorcall TanH(f128arg value) NLIB_NOEXCEPT;
228  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
229  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
230  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
231  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
232  // and Est version needed?
233 
234  //
235  // Interpolation
236  //
237 
238  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
239  static f128 __vectorcall
240  Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
241  static f128 __vectorcall
242  CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
243  static f128 __vectorcall
244  BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
245 
246  //
247  // Exp/Log
248  //
249  static f128 __vectorcall Exp2(f128arg value) NLIB_NOEXCEPT;
250  static f128 __vectorcall ExpE(f128arg value) NLIB_NOEXCEPT;
251  static f128 __vectorcall Log2(f128arg value) NLIB_NOEXCEPT;
252  static f128 __vectorcall LogE(f128arg value) NLIB_NOEXCEPT; // not implemented
253 
254  //
255  // Misc
256  //
257 
258  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
259  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
260  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
261  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
262  static f128 __vectorcall IsNaN(f128arg value) NLIB_NOEXCEPT;
263  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
264 
265  //
266  // Get/Set
267  //
268 
269  template <size_t N>
270  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
271  template <size_t N>
272  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
273  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
274  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
275 
276  template <size_t N>
277  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
278  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
279 
280  //
281  // Swizzle/Permute
282  //
283 
284  template <size_t V0, size_t V1, size_t V2, size_t V3>
285  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
286  template <size_t V0, size_t V1, size_t V2, size_t V3>
287  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
288  template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
289  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
290 
291  template <size_t N>
292  // left <- value[3], value[2], value[1], value[0] -> right
293  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
294  NLIB_STATIC_ASSERT(N < 4);
295  const size_t NN = 4 - N;
296  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
297  }
298  template <size_t N>
299  // left <- value[3], value[2], value[1], value[0] -> right
300  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
301  NLIB_STATIC_ASSERT(N < 4);
302  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
303  }
304  template <size_t N>
305  // left <- b[3], ..., b[0], a[3], ..., a[0] -> right
306  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
307  NLIB_STATIC_ASSERT(N < 4);
308  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
309  }
310 
311  private:
312  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v1000_[4]; // 1.f, 0.f, 0.f, 0.f
313  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0100_[4]; // 0.f, 1.f, 0.f, 0.f
314  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0010_[4]; // 0.f, 0.f, 1.f, 0.f
315  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0001_[4]; // 0.f, 0.f, 0.f, 1.f
316 
317  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float pi_values_[4]; // pi, -pi, 2pi, pi/2
318  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const uint32_t nan_inf_[4]; // nan, inf, -nan, -inf
319 
320  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R0_[4];
321  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R1_[4];
322  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R2_[4];
323  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R3_[4];
324 
325  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R0_[4];
326  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R1_[4];
327  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R2_[4];
328  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R3_[4];
329 
330  // Those coefficients are from Cephes Math Library
331  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_coeff_[6];
332  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_cvalue_[4];
333  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_coeff_[6];
334  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan_coeff_[8];
335  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan2_cvalue_[4];
336 
337  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_P_[4];
338  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_Q_[4];
339  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tanh_cvalue_[4];
340  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_p_[4];
341  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_q_[4];
342  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_c_[4];
343  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float log2_PQ_[12];
344 
345  F128(); // forbidden
346  friend class Vector3;
347  friend class Vector4;
348  friend class Matrix;
349  friend class Plane;
350  friend class Quaternion;
351  friend class Sphere;
352  friend class AxisAlignedBox;
353  friend class OrientedBox;
354  friend class Frustum;
355 
356  friend class DistanceSq;
357  friend class Intersection;
358  friend class Containment;
359 };
360 
#ifndef NLIB_DOXYGEN

// NLIB_M marks the always-inlined member definitions that follow;
// NLIB_M2 is the plain `inline` variant.
#undef NLIB_M
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall
366 
// r[i] = v : broadcast a single float into all four lanes.
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Scalar fallback: write each lane individually.
  f128 ret;
  ret.vec.v[0] = v;
  ret.vec.v[1] = v;
  ret.vec.v[2] = v;
  ret.vec.v[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#elif defined(NLIB_CAFE_PPC)
  // Duplicate v into both slots of each paired-single register.
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
  return ret;
#endif
}
386 
// r[i] = *reinterpret_cast<float*>(&v) : broadcast a raw 32-bit pattern
// into all four lanes (used for masks and special float encodings).
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Write through the uint32_t view so the exact bit pattern is preserved.
  f128 ret;
  ret.vec.u[0] = v;
  ret.vec.u[1] = v;
  ret.vec.u[2] = v;
  ret.vec.u[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  // Type-pun through a union to reinterpret the bits as a float.
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  // NEON can broadcast the integer directly, then reinterpret the vector.
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
  return ret;
#endif
}
417 
// r[0] = a, r[1] = b, r[2] = c, r[3] = d
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a;
  ret.vec.v[1] = b;
  ret.vec.v[2] = c;
  ret.vec.v[3] = d;
  return ret;
#elif defined(NLIB_SSE41)
  // _mm_set_ps takes its arguments highest-lane first, hence (d, c, b, a).
  return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
  // Assemble each 64-bit half in a union; vcreate_f32 consumes the raw
  // 64-bit pattern, and vcombine_f32 joins the two halves.
  union {
    float f32[2];
    uint64_t u64;
  } tmp1, tmp2;
  tmp1.f32[0] = a;
  tmp1.f32[1] = b;
  tmp2.f32[0] = c;
  tmp2.f32[1] = d;
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#elif defined(NLIB_CAFE_PPC)
  // Lanes 0/1 live in ps[0], lanes 2/3 in ps[1].
  f128 ret;
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
  return ret;
#endif
}
448 
template <size_t N>
// r[i] = value[N] : broadcast lane N of `value` to every lane.
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  // vdupq_lane_f32 can only address a 64-bit half, so this generic body
  // handles N = 0, 1; lanes 2 and 3 are covered by the explicit
  // specializations that follow (as are all lanes on Cafe PPC).
  float32x2_t tmp = vget_low_f32(value);
  return vdupq_lane_f32(tmp, N);
#endif
}
467 
// Lane-broadcast specializations. NEON needs vget_high_f32 for lanes 2/3
// (vdupq_lane_f32 only addresses a 64-bit half); Cafe PPC implements all
// four lanes via paired-single merge intrinsics.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
}
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
#endif
505 
// r[i] = 0.f : all four lanes cleared to positive zero.
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#elif defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
  return ret;
#endif
}
525 
template <size_t N>
// r = value, except r[N] = 0.f (other lanes pass through unchanged).
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  // Bit N of the insert-ps immediate's low nibble is the zero mask for
  // lane N; the copy fields are irrelevant when only zeroing.
  return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
  return vsetq_lane_f32(0.f, value, N);
#elif defined(NLIB_CAFE_PPC)
  // Map lane N onto paired-single register N/2, slot N%2.
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = 0.f;
  return ret;
#endif
}
544 
// r[i] = 1.f
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
  return F128::SetValue(1.f, each_float);
}

// r[i] = -1.f
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
  return F128::SetValue(-1.f, each_float);
}

// r[i] = 1.0e-7f (the library's comparison epsilon; note this is not
// FLT_EPSILON)
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
  return F128::SetValue(1.0e-7f, each_float);
}

// r[i] = +infinity (bit pattern 0x7F800000U)
NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
  return F128::SetValue(0x7F800000U, each_uint32);
}

// r[i] = quiet NaN (bit pattern 0x7FC00000U)
NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
  return F128::SetValue(0x7FC00000U, each_uint32);
}

// r[i] = -0.f (sign bit only, 0x80000000U); usable as a sign mask.
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
  return F128::SetValue(-0.f, each_float);
}
574 
// r[i] = p[i], p must be 16-byte aligned.
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = p[0];
  ret.vec.v[1] = p[1];
  ret.vec.v[2] = p[2];
  ret.vec.v[3] = p[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  // Load as uint64_t elements (the 16-byte-aligned pointer satisfies the
  // 8-byte element alignment), then reinterpret to float lanes.
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#elif defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
599 
// r[i] = p[i], p only needs to be 4-byte aligned (unaligned-capable path).
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return LoadA16(p);
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
  // vld1q_f32 assumes only float (4-byte) element alignment.
  return vld1q_f32(p);
#elif defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
617 
// r[i] = p[i], p must be 8-byte aligned.
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // 8-byte alignment is enough for uint64_t element loads, so NEON can use
  // the same code path as LoadA16.
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  // Other backends have no dedicated 8-byte-aligned load; use the 4-byte one.
  return LoadA4(p);
#endif
}
628 
// Integer-address overloads: thin wrappers that cast the address to
// const float* and forward to the pointer-based loads above.

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
658 
// p[i] = value[i], p must be 16-byte aligned.
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  // Store as uint64_t elements; 16-byte alignment satisfies the 8-byte
  // element alignment.
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
678 
// p[i] = value[i], p only needs to be 4-byte aligned.
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  StoreA16(p, value);
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
  // vst1q_f32 assumes only float (4-byte) element alignment.
  vst1q_f32(p, value);
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
694 
// p[i] = value[i], p must be 8-byte aligned.
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // 8-byte alignment permits the uint64_t element store (same as StoreA16).
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  // Other backends have no dedicated 8-byte-aligned store; use the 4-byte one.
  StoreA4(p, value);
#endif
}
704 
// Integer-address overloads: thin wrappers that cast the address to
// float* and forward to the pointer-based stores above.

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
734 
// p[0] = value[0], p[1] = value[1] (low half only), p must be 8-byte aligned.
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  // Store the low 64 bits as one uint64_t element (assumes 8-byte alignment).
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
750 
751 // p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
752 NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT { StoreLoA8(p, value); }
753 
// Integer-address overloads for the low-half stores: cast and forward.

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
773 
// p[0] = value[2], p[1] = value[3] (high half only), p must be 8-byte aligned.
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  // Float element store: only 4-byte element alignment is assumed.
  vst1_f32(p, vget_high_f32(value));
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}
788 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned.
// Delegation is safe: StoreHiA8's NEON path uses vst1_f32, which only
// requires float (4-byte) element alignment.
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT { StoreHiA8(p, value); }
791 
// Integer-address overloads for the high-half stores: cast and forward.

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
811 
812 // r[i] = fabs(value[i])
813 NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
814 #ifdef NLIB_F128_SIMD_NOUSE
815  f128 ret;
816  ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
817  ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
818  ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
819  ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
820  return ret;
821 #elif defined(NLIB_NEON)
822  return vabsq_f32(value);
823 #elif defined(NLIB_SSE41)
824  const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
825  return _mm_andnot_ps(signmask, value);
826 #elif defined(NLIB_CAFE_PPC)
827  f128 ret;
828  ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
829  ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
830  return ret;
831 #endif
832 }
833 
// r[i] = mask[i] ? a[i] : b[i] : bitwise per-lane select; mask lanes are
// expected to be all-ones or all-zeros (comparison results).
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Pure bitwise blend through the uint32_t view.
  f128 result;
  result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
  result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
  result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
  result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
  return result;
#elif defined(NLIB_SSE41)
  // _mm_blendv_ps picks its second operand where the mask sign bit is set,
  // hence the (b, a, mask) argument order.
  return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(NLIB_CAFE_PPC)
  // avoid NaN: clear mantissa bit 23 so an all-ones mask lane is not a NaN
  // bit pattern when __PS_SEL evaluates it; the sign still drives the select.
  f128 mask_ = mask;
  mask_.vec.u[0] &= 0xFF7FFFFFUL;
  mask_.vec.u[1] &= 0xFF7FFFFFUL;
  mask_.vec.u[2] &= 0xFF7FFFFFUL;
  mask_.vec.u[3] &= 0xFF7FFFFFUL;
  // mask_ < 0 ? a : b
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
861 
862 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(NLIB_CAFE_PPC)
// r[i] = static_cast<float>(value[i])
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
    // i128 is a byte vector on NEON; reinterpret as 4 x int32 before converting.
    return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}
871 
// r[i] = *reinterpret_cast<float*>(&value[i])
// Pure bit reinterpretation; no value conversion takes place.
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_f32_s8(value);
#endif
}
880 
// r[i] = static_cast<int>(roundf(value[i]))
// NOTE(review): the two backends round halves differently -- SSE uses the
// current rounding mode (default: round-half-to-even), while the NEON code
// below rounds half away from zero. Confirm callers do not depend on ties.
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
    // Add +/-0.5 matching each lane's sign, then truncate toward zero.
    uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
    uint32x4_t sgn = vdupq_n_u32(0x80000000U);
    uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
    w = vorrq_u32(w, half);
    return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}
893 
// r[i] = static_cast<int>(value[i]), truncated toward zero
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}
901 
// r[i] = *reinterpret_cast<int*>(&value[i])
// Inverse of CastFromI128; bit pattern is preserved unchanged.
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_f32(value);
#endif
}
910 #endif
911 
// r[i] = (a[i] < b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcltq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // CAUTION:
    // The result may not be right if a or b is NaN
    // Encodes the result in the sign of (a - b): negative means "true".
    // Presumably consumed only by sign-testing code such as Select() -- TODO confirm.
    f128 ret;
    ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
935 
// r[i] = (a[i] <= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcleq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // CAUTION:
    // The result may not be right if a or b is NaN
    // b - a >= 0 (i.e. a <= b) selects -1, else +1; "true" is encoded as a
    // negative value for later sign tests.
    f128 ret;
    f32x2 one = __PS_FDUP(1.f);
    f32x2 minus_one = __PS_NEG(one);
    f32x2 x0, x1;
    x0 = __PS_SUB(b.vec.ps[0], a.vec.ps[0]);
    x1 = __PS_SUB(b.vec.ps[1], a.vec.ps[1]);
    ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
    ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
    return ret;
#endif
}
964 
// r[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgtq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // CAUTION:
    // The result may not be right if a or b is NaN
    // Mirror of CmpLt: sign of (b - a) is negative when a > b.
    f128 ret;
    ret.vec.ps[0] = __PS_SUB(b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SUB(b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
988 
// r[i] = (a[i] >= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgeq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // CAUTION:
    // The result may not be right if a or b is NaN
    // a - b >= 0 selects -1 (negative == "true" for later sign tests).
    f128 ret;
    f32x2 one = __PS_FDUP(1.f);
    f32x2 minus_one = __PS_NEG(one);
    f32x2 x0, x1;
    x0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    x1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
    ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
    return ret;
#endif
}
1017 
// r[i] = (a[i] != b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
    // NEON has no "not equal" compare; invert the equality mask.
    uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // CAUTION:
    // The result may not be right if a or b is NaN
    // DO NOT USE __PS_NEG()
    // avoid 0.f * -0.f => -0.f
    // (a-b)*(b-a) is negative exactly when a != b, zero when equal;
    // the symmetric product keeps a == b mapped to +0.f.
    f128 ret;
    ret.vec.ps[0] = __PS_MUL(__PS_SUB(a.vec.ps[0], b.vec.ps[0]),
                             __PS_SUB(b.vec.ps[0], a.vec.ps[0]));
    ret.vec.ps[1] = __PS_MUL(__PS_SUB(a.vec.ps[1], b.vec.ps[1]),
                             __PS_SUB(b.vec.ps[1], a.vec.ps[1]));
    return ret;
#endif
}
1045 
1046 // r[i] = a[i] + b[i]
1047 NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
1048 #ifdef NLIB_F128_SIMD_NOUSE
1049  f128 ret;
1050  ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
1051  ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
1052  ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
1053  ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
1054  return ret;
1055 #elif defined(NLIB_SSE41)
1056  return _mm_add_ps(a, b);
1057 #elif defined(NLIB_NEON)
1058  return vaddq_f32(a, b);
1059 #elif defined(NLIB_CAFE_PPC)
1060  f128 ret;
1061  ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
1062  ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
1063  return ret;
1064 #endif
1065 }
1066 
1067 // r[i] = a[i] - b[i]
1068 NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
1069 #ifdef NLIB_F128_SIMD_NOUSE
1070  f128 ret;
1071  ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
1072  ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
1073  ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
1074  ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
1075  return ret;
1076 #elif defined(NLIB_SSE41)
1077  return _mm_sub_ps(a, b);
1078 #elif defined(NLIB_NEON)
1079  return vsubq_f32(a, b);
1080 #elif defined(NLIB_CAFE_PPC)
1081  f128 ret;
1082  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
1083  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
1084  return ret;
1085 #endif
1086 }
1087 
// r[i] = -value[i]
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise negation (also flips the sign of zero).
    f128 ret;
    ret.vec.v[0] = -value.vec.v[0];
    ret.vec.v[1] = -value.vec.v[1];
    ret.vec.v[2] = -value.vec.v[2];
    ret.vec.v[3] = -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vnegq_f32(value);
#elif defined(NLIB_SSE41)
    // Flip the sign bit of every lane.
    const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
    return _mm_xor_ps(signmask, value);
#elif defined(NLIB_CAFE_PPC)
    f128 ret;
    ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
    return ret;
#endif
}
1109 
1110 // r[i] = a[i] * b[i]
1111 NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
1112 #ifdef NLIB_F128_SIMD_NOUSE
1113  f128 ret;
1114  ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
1115  ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
1116  ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
1117  ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
1118  return ret;
1119 #elif defined(NLIB_SSE41)
1120  return _mm_mul_ps(a, b);
1121 #elif defined(NLIB_NEON)
1122  return vmulq_f32(a, b);
1123 #elif defined(NLIB_CAFE_PPC)
1124  f128 ret;
1125  ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
1126  ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
1127  return ret;
1128 #endif
1129 }
1130 
// r[i] = a * b[i]
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON multiplies by a scalar directly.
    return vmulq_n_f32(b, a);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
    ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
    return ret;
#else
    // Other backends: broadcast the scalar, then use the vector multiply.
    return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}
1144 
1145 template <size_t N>
1146 // r[i] = a[N] * b[i]
1147 NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
1148 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1149 # if __aarch64__
1150  return vmulq_laneq_f32(b, a, N);
1151 # else
1152  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
1153  return vmulq_n_f32(b, tmp);
1154 # endif
1155 #elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
1156  float t = a.vec.ps[N / 2][N % 2];
1157  f128 ret;
1158  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
1159  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
1160  return ret;
1161 #else
1162  return F128::Mult(F128::SetValue<N>(a, each_select32), b);
1163 #endif
1164 }
1165 
// r[i] = a[i] / b[i]
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
    // NEON has no divide: reciprocal estimate refined by two Newton-Raphson
    // steps, then multiplied by a.
    float32x4_t inv0 = vrecpeq_f32(b);
    float32x4_t step0 = vrecpsq_f32(inv0, b);
    float32x4_t inv1 = vmulq_f32(step0, inv0);
    float32x4_t step1 = vrecpsq_f32(inv1, b);
    float32x4_t inv2 = vmulq_f32(step1, inv1);
    // Lanes dividing by zero are forced to infinity.
    // NOTE(review): +inf is used regardless of the signs of a and b -- confirm
    // callers do not rely on IEEE signed-infinity results here.
    uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
    inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
    return vmulq_f32(a, inv2);
#elif defined(NLIB_CAFE_PPC)
    f128 ret;
    ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1193 
// r[i] = max(a[i], b[i])
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback; if a lane compares unordered (NaN), b's lane is kept.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
    return vmaxq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
    // Select by the sign of (a - b): non-negative keeps a, negative keeps b.
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1216 
// r[i] = min(a[i], b[i])
NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback; if a lane compares unordered (NaN), b's lane is kept.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
    return vminq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
    // Select by the sign of (a - b): non-negative keeps b, negative keeps a.
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
1239 
// r[0] = max(a[0], a[1]), r[1] = max(a[2], a[3]), r[2] = max(b[0], b[1]), r[3] = max(b[2], b[3])
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Max each lane against its pair-neighbor, then gather the even lanes.
    f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpmaxq_f32(a, b);
# else
    float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(NLIB_CAFE_PPC)
    // Transpose to {even lanes} / {odd lanes}, then select by sign of the
    // difference (as in Max()).
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
    return ret;
#endif
}
1275 
// r[0] = min(a[0], a[1]), r[1] = min(a[2], a[3]), r[2] = min(b[0], b[1]), r[3] = min(b[2], b[3])
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Min each lane against its pair-neighbor, then gather the even lanes.
    f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpminq_f32(a, b);
# else
    float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(NLIB_CAFE_PPC)
    // Transpose to {even lanes} / {odd lanes}, then select by sign of the
    // difference (as in Min(): operand order swapped versus PairwiseMax).
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
    return ret;
#endif
}
1311 
// r[0] = a[0] + a[1], r[1] = a[2] + a[3], r[2] = b[0] + b[1], r[3] = b[2] + b[3]
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpaddq_f32(a, b);
# else
    // 32-bit NEON: pairwise-add each half, then recombine.
    float32x2_t al = vget_low_f32(a);
    float32x2_t ah = vget_high_f32(a);
    float32x2_t l = vpadd_f32(al, ah);

    float32x2_t bl = vget_low_f32(b);
    float32x2_t bh = vget_high_f32(b);
    float32x2_t h = vpadd_f32(bl, bh);
    return vcombine_f32(l, h);
# endif
#elif defined(NLIB_CAFE_PPC)
    // Transpose to {even lanes} / {odd lanes}, then add.
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    ret.vec.ps[0] = __PS_ADD(v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    ret.vec.ps[1] = __PS_ADD(v02, v13);
    return ret;
#endif
}
1348 
1349 // r[i] = fabs(a[i] - b[i])
1350 NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
1351 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1352  return vabdq_f32(a, b);
1353 #else
1354  return F128::Abs(F128::Sub(a, b));
1355 #endif
1356 }
1357 
// r[i] = c[i] + a[i] * b[i]
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // Multiply-accumulate in one instruction.
    return vmlaq_f32(c, a, b);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    // Other backends: separate multiply and add.
    return F128::Add(c, F128::Mult(a, b));
#endif
}
1371 
1372 // r[i] = c[i] + a * b[i]
1373 NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
1374 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1375  return vmlaq_n_f32(c, b, a);
1376 #else
1377  return F128::MultAdd(F128::SetValue(a, each_float), b, c);
1378 #endif
1379 }
1380 
template <size_t N>
// r[i] = c[i] + a[N] * b[i]
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
    // Lane index must be 0..3.
    NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    return vmlaq_laneq_f32(c, b, a, N);
# else
    // 32-bit NEON: pick the 64-bit half holding lane N, then mla by lane.
    return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
# endif
#else
    // Other backends: splat lane N, then use the vector MultAdd.
    return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
1396 
// r[i] = c[i] - a[i] * b[i]
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // Multiply-subtract in one instruction.
    return vmlsq_f32(c, a, b);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
    // __PS_NMSUB computes -(a*b - c) == c - a*b.
    f128 ret;
    ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    // Other backends: separate multiply and subtract.
    return F128::Sub(c, F128::Mult(a, b));
#endif
}
1410 
// r[i] = c[i] - a * b[i]
NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON multiply-subtract with a scalar multiplier.
    return vmlsq_n_f32(c, b, a);
#else
    // Other backends: broadcast the scalar and reuse the vector overload.
    return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}
1419 
1420 template <size_t N>
1421 // r[i] = c[i] - a[N] * b[i]
1422 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
1423  each_select32_tag) NLIB_NOEXCEPT {
1424  NLIB_STATIC_ASSERT(N < 4);
1425 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1426 # ifdef __arch64__
1427  return vmlsq_laneq_f32(c, b, a, N);
1428 # else
1429  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1430 # endif
1431 #else
1432  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
1433 #endif
1434 }
1435 
1436 // r[i] = a[i] + t[i] * (b[i] - a[i])
1437 NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
1438  // a + t * (b - a)
1439  return F128::MultAdd(t, F128::Sub(b, a), a);
1440 }
1441 
// -pi <= angle1, angle2, r < pi
// Adds two angles and wraps the result back into [-pi, pi).
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    // pi_values_ presumably packs {pi, -pi, 2*pi, ...} -- lanes 0/1/2 are read
    // below as +pi, -pi and 2*pi respectively. TODO confirm against its definition.
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 zero = F128::SetZero();
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    f128 sum = F128::Add(angle1, angle2);
    // Below -pi: wrap upward by adding 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::Select(cond, pi_dbl, zero);
    f128 result = F128::Add(sum, ofs);
    // At or above +pi: wrap downward by subtracting 2*pi.
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::Select(cond, pi_dbl, zero);
    return F128::Sub(result, ofs);
}
1458 
// -pi <= angle1, angle2, r < pi
// Subtracts two angles and wraps the result back into [-pi, pi).
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 zero = F128::SetZero();
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    // Note: 'sum' actually holds the difference; same wrap logic as AddAngle.
    f128 sum = F128::Sub(angle1, angle2);
    // Below -pi: wrap upward by adding 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::Select(cond, pi_dbl, zero);
    f128 result = F128::Add(sum, ofs);
    // At or above +pi: wrap downward by subtracting 2*pi.
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::Select(cond, pi_dbl, zero);
    return F128::Sub(result, ofs);
}
1475 
// ( 2 1 -2 1) (p0)
// (t^3 t^2 t 1) (-3 -2 3 -1) (v0)
// ( 0 1 0 0) (p1)
// ( 1 0 0 0) (v1)
// Cubic Hermite interpolation between p0 (tangent v0) and p1 (tangent v1) at t.
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
    // (2 * p0 + v0 - 2 * p1 + v1) * t^3 + (-3 * p0 - 2 * v0 + 3 * p1 - v1) * t^2
    // + v0 * t + p0
    // ==
    // (2 * t^3 - 3 * t^2 + 1) * p0 + (t^3 - 2 * t^2 + t) * v0
    // + (-2 * t^3 + 3 * t^2) * p1 + (t^3 - t^2) * v1
    // Build the four basis weights in ttt; hermite_R0_..R3_ presumably hold the
    // per-power coefficient rows of the matrix above. TODO confirm their values.
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
    ttt = F128::Mult(ttt, F128::LoadA16(hermite_R0_));
    ttt = F128::MultAdd(tt, F128::LoadA16(hermite_R1_), ttt);
    ttt = F128::MultAdd(t, F128::LoadA16(hermite_R2_), ttt);
    ttt = F128::Add(ttt, F128::LoadA16(hermite_R3_));

    // vec(ttt) * mtx(p0, v0, p1, v1)
    f128 result = F128::Mult<0>(ttt, p0, each_select32);
    result = F128::MultAdd<1>(ttt, v0, result, each_select32);
    result = F128::MultAdd<2>(ttt, p1, result, each_select32);
    result = F128::MultAdd<3>(ttt, v1, result, each_select32);
    return result;
}
1501 
// (-1 3 -3 1) (p0)
// 0.5 * (t^3 t^2 t 1) ( 2 -5 4 -1) (p1)
// (-1 0 1 0) (p2)
// ( 0 2 0 0) (p3)
// Catmull-Rom spline through p1..p2 at parameter t, using p0/p3 as neighbors.
NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
    // Build the four basis weights in ttt; catmull_R0_..R3_ presumably hold the
    // per-power coefficient rows (including the 0.5 factor). TODO confirm.
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
    ttt = F128::Mult(ttt, F128::LoadA16(catmull_R0_));
    ttt = F128::MultAdd(tt, F128::LoadA16(catmull_R1_), ttt);
    ttt = F128::MultAdd(t, F128::LoadA16(catmull_R2_), ttt);
    ttt = F128::Add(ttt, F128::LoadA16(catmull_R3_));

    // vec(ttt) * mtx(p0, p1, p2, p3)
    f128 result = F128::Mult<0>(ttt, p0, each_select32);
    result = F128::MultAdd<1>(ttt, p1, result, each_select32);
    result = F128::MultAdd<2>(ttt, p2, result, each_select32);
    result = F128::MultAdd<3>(ttt, p3, result, each_select32);

    return result;
}
1523 
1524 // p0 + f * (p1 - p0) + g * (p2 - p0)
1525 NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
1526  f128arg_ex g) NLIB_NOEXCEPT {
1527  f128 p1p0 = F128::Sub(p1, p0);
1528  f128 p2p0 = F128::Sub(p2, p0);
1529  f128 tmp = F128::MultAdd(f, p1p0, p0);
1530  return F128::MultAdd(g, p2p0, tmp);
1531 }
1532 
// r[i] = a[i] & b[i]  (bitwise, on the raw 32-bit lane patterns)
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    // No float bitwise ops here; operate through the integer view of the union.
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1549 
// r[i] = a[i] | b[i]  (bitwise, on the raw 32-bit lane patterns)
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    // No float bitwise ops here; operate through the integer view of the union.
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1566 
// r[i] = a[i] ^ b[i]  (bitwise, on the raw 32-bit lane patterns)
NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    // No float bitwise ops here; operate through the integer view of the union.
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1583 
1584 // r[i] = ~a[i]
1585 NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
1586 #if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
1587  f128 ret;
1588  ret.vec.u[0] = ~a.vec.u[0];
1589  ret.vec.u[1] = ~a.vec.u[1];
1590  ret.vec.u[2] = ~a.vec.u[2];
1591  ret.vec.u[3] = ~a.vec.u[3];
1592  return ret;
1593 #elif defined(NLIB_SSE41)
1594  return _mm_andnot_ps(a, F128::CmpEq(a, a));
1595 #elif defined(NLIB_NEON)
1596  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
1597  return vreinterpretq_f32_u32(tmp);
1598 #endif
1599 }
1600 
// r[i] = ~a[i] & b[i]  (bitwise, on the raw 32-bit lane patterns)
NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    // No float bitwise ops here; operate through the integer view of the union.
    f128 ret;
    ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
    // vbicq computes first & ~second, hence the swapped operand order.
    uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1617 
// r[i] = ~a[i] | b[i]  (bitwise, on the raw 32-bit lane patterns)
NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    // No float bitwise ops here; operate through the integer view of the union.
    f128 ret;
    ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
    // vornq computes first | ~second, hence the swapped operand order.
    uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1634 
// r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: all-ones or all-zeros mask per lane.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vceqq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
    // -(a-b)^2 is zero when equal, negative otherwise; select -1 for equal
    // (x >= 0, i.e. x == 0 here) and +1 for unequal. "True" is negative for
    // later sign tests. May be wrong if a or b is NaN.
    f32x2 x0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 x1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    x0 = __PS_MUL(x0, __PS_NEG(x0));
    x1 = __PS_MUL(x1, __PS_NEG(x1));
    f32x2 one = __PS_FDUP(1.f);
    f32x2 minus_one = __PS_NEG(one);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
    ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
    return ret;
#endif
}
1662 
1663 // r[i] = (absf(a[i] - b[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1664 NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
1665  f128 tmp = F128::AbsDiff(a, b);
1666  return F128::CmpLe(tmp, eps);
1667 }
1668 
1669 // r[i] = clamp(value[i], min[i], max[i])
1670 NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
1671  return F128::Min(max, F128::Max(min, value));
1672 }
1673 
// r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NOTE(review): vcaleq compares |value| <= |bounds|, while the generic
    // path compares |value| <= bounds. They differ when a bounds lane is
    // negative -- confirm callers always pass non-negative bounds.
    uint32x4_t tmp = vcaleq_f32(value, bounds);
    return vreinterpretq_f32_u32(tmp);
#else
    return F128::CmpLe(F128::Abs(value), bounds);
#endif
}
1683 
// r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback; division by zero yields IEEE infinity directly.
    f128 ret;
    ret.vec.v[0] = 1.f / value.vec.v[0];
    ret.vec.v[1] = 1.f / value.vec.v[1];
    ret.vec.v[2] = 1.f / value.vec.v[2];
    ret.vec.v[3] = 1.f / value.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
    // Reciprocal estimate refined by two Newton-Raphson iterations.
    float32x4_t x;
    x = vrecpeq_f32(value);
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x1 = x0 * (2 - x0 * value)
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x2 = x1 * (2 - x1 * value)
    // Zero lanes are forced to infinity, matching the contract above.
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#elif defined(NLIB_CAFE_PPC)
    return F128::Div(F128::SetOne(), value);
#endif
}
1707 
// r[i] = 1.f / value[i] with lower precision
// Unlike Recp(), no refinement steps and no special handling of zero lanes
// on the estimate-based backends.
NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback is exact (no cheaper estimate available).
    f128 ret;
    ret.vec.v[0] = 1.f / value.vec.v[0];
    ret.vec.v[1] = 1.f / value.vec.v[1];
    ret.vec.v[2] = 1.f / value.vec.v[2];
    ret.vec.v[3] = 1.f / value.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
    return vrecpeq_f32(value);
#elif defined(NLIB_CAFE_PPC)
    f128 ret;
    ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
    return ret;
#endif
}
1728 
// r[i] = sqrtf(value[i]) with higher precision
NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) = v * rsqrt(v). RecpSqrt(0) is +inf and 0 * inf is NaN,
    // so zero lanes are forced back to zero.
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#elif defined(NLIB_CAFE_PPC)
    // Same derivation as the NEON path.
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#endif
}
1752 
// r[i] = sqrtf(value[i]) with lower precision
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    // No estimate-only sqrt on SSE; use the exact one.
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) estimated as 1 / rsqrt(v), both as hardware estimates.
    return vrecpeq_f32(vrsqrteq_f32(value));
#elif defined(NLIB_CAFE_PPC)
    // Same 1 / rsqrt(v) trick with paired-single estimates.
    f128 ret;
    ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
    return ret;
#endif
}
1773 
// r[i] = sqrtf(1.f / value[i]) with higher precision
NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: 1/sqrt(0) already yields +infinity.
    f128 ret;
    ret.vec.v[0] = 1.f / sqrtf(value.vec.v[0]);
    ret.vec.v[1] = 1.f / sqrtf(value.vec.v[1]);
    ret.vec.v[2] = 1.f / sqrtf(value.vec.v[2]);
    ret.vec.v[3] = 1.f / sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
    // Estimate refined by two Newton-Raphson steps
    // (vrsqrtsq_f32(v, x*x) computes (3 - v * x * x) / 2).
    float32x4_t x;
    x = vrsqrteq_f32(value);
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    // Zero lanes would produce NaN through the refinement; force +inf.
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#elif defined(NLIB_CAFE_PPC)
    // Two Newton-Raphson steps per pair: x' = 0.5 * x * (3 - v * x * x),
    // where __PS_NMSUB(v, xx, three) computes three - v * xx.
    f32x2 three = __PS_FDUP(3.f);
    f32x2 half = __PS_FDUP(0.5f);
    f32x2 x;
    f32x2 xx;
    f32x2 v;
    f128 ret;

    // First pair: estimate, then two refinement steps.
    v = value.vec.ps[0];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[0] = __PS_MUL(half, xx);

    // Second pair: same sequence.
    v = value.vec.ps[1];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[1] = __PS_MUL(half, xx);

    // Zero lanes would be garbage after refinement; force +inf.
    f128 iszero = F128::CmpEq(F128::SetZero(), value);
    f128 inf = F128::SetInfinity();
    return F128::Select(iszero, inf, ret);
#endif
}
1832 
// r[i] = sqrtf(1.f / value[i]) with lower precision
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback computes the exact value.
    f128 ret;
    ret.vec.v[0] = 1.f / sqrtf(value.vec.v[0]);
    ret.vec.v[1] = 1.f / sqrtf(value.vec.v[1]);
    ret.vec.v[2] = 1.f / sqrtf(value.vec.v[2]);
    ret.vec.v[3] = 1.f / sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    // Hardware reciprocal-sqrt estimate (roughly 12 bits of precision).
    return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
    return vrsqrteq_f32(value);
#elif defined(NLIB_CAFE_PPC)
    // Paired-single rsqrt estimate, one pair at a time.
    f128 ret;
    ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
    return ret;
#endif
}
1853 
// r[i] = NegateLaneI ? -value[i] : value[i] (generic fallback; faster
// specializations below cover common lane patterns).
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
    // Permute indices 0-3 select from the first operand (value) and 4-7 from
    // the second (Negate(value)), so each flag picks its lane's negated copy.
    const size_t lane0 = NegateLane0 ? 4 : 0;
    const size_t lane1 = NegateLane1 ? 5 : 1;
    const size_t lane2 = NegateLane2 ? 6 : 2;
    const size_t lane3 = NegateLane3 ? 7 : 3;
    return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}
1862 
// No lanes negated: identity.
template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
    return value;
}

// All lanes negated: plain vector negate.
template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
    return F128::Negate(value);
}
1872 
#ifndef NLIB_F128_SIMD_NOUSE
#ifdef NLIB_NEON
// Negate even lanes (0 and 2) only.
template <>
NLIB_M(f128) F128::NegateEx<true, false, true, false>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    // Negate everything, then copy the untouched odd lanes back in.
    float32x4_t tmp = vnegq_f32(value);
    tmp = vcopyq_laneq_f32(tmp, 1, value, 1);
    tmp = vcopyq_laneq_f32(tmp, 3, value, 3);
    return tmp;
#else
    // vtrn gathers even lanes into val[0] and odd lanes into val[1];
    // negate the even half, then transpose back into lane order.
    float32x2x2_t tmp = vtrn_f32(vget_low_f32(value), vget_high_f32(value));
    tmp.val[0] = vneg_f32(tmp.val[0]);
    tmp = vtrn_f32(tmp.val[0], tmp.val[1]);
    return vcombine_f32(tmp.val[0], tmp.val[1]);
#endif
}

// Negate odd lanes (1 and 3) only.
template <>
NLIB_M(f128) F128::NegateEx<false, true, false, true>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    // Negate everything, then copy the untouched even lanes back in.
    float32x4_t tmp = vnegq_f32(value);
    tmp = vcopyq_laneq_f32(tmp, 0, value, 0);
    tmp = vcopyq_laneq_f32(tmp, 2, value, 2);
    return tmp;
#else
    // Same transpose trick as above, negating the odd half instead.
    float32x2x2_t tmp = vtrn_f32(vget_low_f32(value), vget_high_f32(value));
    tmp.val[1] = vneg_f32(tmp.val[1]);
    tmp = vtrn_f32(tmp.val[0], tmp.val[1]);
    return vcombine_f32(tmp.val[0], tmp.val[1]);
#endif
}

// Negate the high pair (lanes 2 and 3) only.
template <>
NLIB_M(f128) F128::NegateEx<false, false, true, true>(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vneg_f32(vget_high_f32(value));
    return vcombine_f32(lo, hi);
}

// Negate the low pair (lanes 0 and 1) only.
template <>
NLIB_M(f128) F128::NegateEx<true, true, false, false>(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vneg_f32(vget_low_f32(value));
    float32x2_t hi = vget_high_f32(value);
    return vcombine_f32(lo, hi);
}
#elif defined(NLIB_SSE41)
// Negate even lanes: addsub computes {0 - v0, 0 + v1, 0 - v2, 0 + v3}.
template <>
NLIB_M(f128) F128::NegateEx<true, false, true, false>(f128arg value) NLIB_NOEXCEPT {
    return _mm_addsub_ps(SetZero(), value);
}

// Negate odd lanes: negate the even-lane-negated result above.
template <>
NLIB_M(f128) F128::NegateEx<false, true, false, true>(f128arg value) NLIB_NOEXCEPT {
    f128 zero = SetZero();
    return _mm_sub_ps(zero, _mm_addsub_ps(zero, value));
}
#endif
#endif
1931 
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
// Scalar IEEE-754 single-precision classification on the raw bits (vec.u[]).
// NaN: exponent all ones with a non-zero mantissa.
#define NLIB_ISNAN(vec, idx) \
    ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
// Infinity: exponent all ones with a zero mantissa (sign bit ignored).
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif
1937 
1938 // r[i] = isnan(value[i]) ? 0xFFFFFFFF : 0
1939 NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
1940 #if defined(NLIB_F128_SIMD_NOUSE)
1941  f128 ret;
1942  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
1943  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
1944  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
1945  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
1946  return ret;
1947 #elif defined(NLIB_CAFE_PPC)
1948  // on CAFE, value is NaN if value < 0 && -value < 0
1949  f32x2 one = __PS_FDUP(1.f);
1950  f32x2 minus_one = __PS_NEG(one);
1951  f32x2 v0 = value.vec.ps[0];
1952  f32x2 v1 = value.vec.ps[1];
1953  f32x2 t0 = __PS_SEL(v0, one, minus_one);
1954  f32x2 t1 = __PS_SEL(v1, one, minus_one);
1955  f128 ret;
1956  f32x2 v0neg = __PS_NEG(v0);
1957  f32x2 v1neg = __PS_NEG(v1);
1958  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
1959  ret.vec.ps[1] = __PS_SEL(v1neg, one, t0);
1960  return ret;
1961 #else
1962  return F128::CmpNe(value, value);
1963 #endif
1964 }
1965 
// r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(NLIB_CAFE_PPC)
    // FLT_MAX - |v| is negative only when |v| > FLT_MAX (i.e. infinite);
    // CAFE mask consumers in this file test only the sign bit, so the sign
    // of the difference is the answer.
    f128 ret;
    f32x2 big_value = __PS_FDUP(FLT_MAX);
    ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
    return ret;
#else
    // |v| == +inf catches both +inf and -inf.
    f128 inf_value = F128::SetInfinity();
    f128 abs_value = F128::Abs(value);
    return F128::CmpEq(inf_value, abs_value);
#endif
}
1987 
// for example, 6.54321 -> 7.0, -6.54321 -> -7.0
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
    return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
    // Add and Sub a big value to round the number after the decimal point:
    // adding 2^23 (0x4B000000), carrying value's own sign, pushes the
    // fraction bits out of the significand; rounding follows the current FP
    // rounding mode (round-to-nearest-even by default).
    f128 sgn = F128::And(value, F128::SetSignMask());
    f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
    f128 result = F128::Sub(F128::Add(value, sm), sm);
    return result;
#endif
}
2000 
// for example, 6.54321 -> 6.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
// Note that there is no fraction if |value| > 2^23
// 2^23 = 8388608
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    f128 ret;
    for (size_t i = 0; i < 4; ++i) {
        if (NLIB_ISNAN(value.vec, i)) {
            // NaN lanes become the canonical quiet NaN bit pattern.
            ret.vec.u[i] = 0x7FC00000U;
        } else {
            // Truncate through int conversion only where it cannot overflow;
            // larger magnitudes already have no fractional part.
            ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                               ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                               : value.vec.v[i];
        }
    }
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
    // Round-trip through integers; keep the input unchanged for lanes too
    // large to fit (conversion would overflow, and they have no fraction).
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    return F128::Select(cond, casted, value);
#endif
}
2027 
// for example, 6.54321 -> 6.0, -6.54321 -> -7.0
NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    f128 ret;
    ret.vec.v[0] = floorf(value.vec.v[0]);
    ret.vec.v[1] = floorf(value.vec.v[1]);
    ret.vec.v[2] = floorf(value.vec.v[2]);
    ret.vec.v[3] = floorf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_floor_ps(value);
#else
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, which overshoots floor for negatives.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // -1 if result is larger
    f128 largeMask = F128::CmpGt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
    return F128::Select(cond, casted, value);
#endif
}
2054 
// for example, 6.54321 -> 7.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    f128 ret;
    ret.vec.v[0] = ceilf(value.vec.v[0]);
    ret.vec.v[1] = ceilf(value.vec.v[1]);
    ret.vec.v[2] = ceilf(value.vec.v[2]);
    ret.vec.v[3] = ceilf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_ceil_ps(value);
#else
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, which undershoots ceil for positives.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // +1 if result is smaller
    f128 smallMask = F128::CmpLt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f (subtracting -1 adds 1)
    casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
    return F128::Select(cond, casted, value);
#endif
}
2081 
#ifdef NLIB_F128_SIMD_NOUSE
// NOTE(review): these helpers were defined for NLIB_F128_SIMD_NOUSE ||
// NLIB_CAFE_PPC but are undefined only for NLIB_F128_SIMD_NOUSE, so they
// stay visible in CAFE_PPC builds -- confirm whether that is intentional.
#undef NLIB_ISNAN
#undef NLIB_ISINF
#endif
2086 
2087 // r[i] = clamp(value[i], { 0, 0, 0, 0 }, { 1, 1, 1, 1 })
2088 NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
2089  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
2090 }
2091 
2092 NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
2093  static const float v_1_2pi = 0.15915494309189535f;
2094  static const float v_2pi = 6.283185307179586f;
2095  // value - 2pi * round(value * (1/2pi)) to be [-pi, pi)
2096  const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
2097  f128 round = F128::Round(F128::Mult(value, recpTwoPi));
2098  const f128 twoPi = F128::SetValue(v_2pi, each_float);
2099  return F128::MultSub(twoPi, round, value);
2100 }
2101 
// r[i] = sin(value[i]), computed with a polynomial after range reduction.
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use sin(x) == sin(pi - x), sin(x) == sin(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 pi_ = F128::LoadA16(F128::pi_values_);
    f128 pi = F128::SetValue<0>(pi_, each_select32);
    f128 pidiv2 = F128::SetValue<3>(pi_, each_select32);

    // NOTE(review): the comparison uses |value|, not |x|; for inputs beyond
    // [-pi, pi) this reflects lanes already in range. The reflection identity
    // keeps the result mathematically correct but may cost a little accuracy
    // -- confirm whether |x| was intended (Cos and SinCos do the same).
    f128 xabs = F128::Abs(value);
    // mypi is pi carrying the sign of x, so pi_x is pi - x or -pi - x.
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, pi);
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, pidiv2);
    x = F128::Select(cond, x, pi_x);

    // Odd polynomial in x evaluated by Horner's method on xx = x^2,
    // with coefficients from sin_coeff_; final steps restore the x factor.
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(sin_coeff_);
    f128 result;
    result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[4], each_float));
    result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[5], each_float));
    result = F128::Mult(xx, result);
    result = F128::MultSub(result, x, x);
    return result;
}
2135 
// r[i] = cos(value[i]), computed with a polynomial after range reduction.
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 cvalue = F128::LoadA16(cos_cvalue_);

    // NOTE(review): uses |value|, not |x| -- same as Sin/SinCos; the sign
    // factor below compensates, so results stay correct, but confirm intent.
    f128 xabs = F128::Abs(value);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
    x = F128::Select(cond, x, pi_x);

    // +1 if [-pi/2, pi/2], -1 otherwise
    f128 sign = F128::Select(cond, F128::SetValue<2>(cvalue, each_select32), // 1
                             F128::SetValue<3>(cvalue, each_select32)); // -1

    // xx = x^2
    // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(cos_coeff_);
    f128 result;
    result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[4], each_float));
    result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[5], each_float));
    result = F128::MultSub(xx, result, F128::SetOne());
    // Apply the reflection sign: cos(x) = -cos(pi - x).
    result = F128::Mult(sign, result);
    return result;
}
2173 
// Computes both at once: r.val[0][i] = sin(value[i]), r.val[1][i] = cos(value[i]).
// Shares the range reduction between the two polynomials.
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 cvalue = F128::LoadA16(cos_cvalue_);

    // NOTE(review): uses |value|, not |x| -- same as Sin/Cos; confirm intent.
    f128 xabs = F128::Abs(value);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
    x = F128::Select(cond, x, pi_x);

    // +1 if [-pi/2, pi/2], -1 otherwise (only cosine needs the sign flip;
    // the sine reflection identity has no sign change)
    f128 sign = F128::Select(cond, F128::SetValue<2>(cvalue, each_select32), // 1
                             F128::SetValue<3>(cvalue, each_select32)); // -1

    // xx = x^2
    // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
    f128 xx = F128::Mult(x, x);
    f128x2 ret;

    // cos
    {
        f128 coeff = F128::LoadA16(cos_coeff_);
        f128 result;
        result =
            F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[4], each_float));
        result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[5], each_float));
        result = F128::MultSub(xx, result, F128::SetOne());

        ret.val[1] = F128::Mult(sign, result); // cos
    }

    // sin
    {
        f128 coeff = F128::LoadA16(sin_coeff_);
        f128 result;
        result =
            F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[4], each_float));
        result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[5], each_float));
        result = F128::Mult(xx, result);
        ret.val[0] = F128::MultSub(result, x, x); // sin
    }
    return ret;
}
2233 
// r[i] = atan(value[i]), reduced to |x| <= 1 via atan(x) + atan(1/x) = +/-pi/2.
NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
    // |value| <= 1 -> atan(value)
    // value > 1 -> pi/2 - atan(1/value)
    // value < -1 -> -pi/2 - atan(1/value)
    f128 cmp, value_sign;
    {
        f128 one = F128::SetOne();
        f128 negative_one = F128::SetNegativeOne();

        // value_sign:
        // 1 if value > 1,
        // -1 if value < -1
        value_sign = F128::Select(F128::CmpGt(value, one), one, negative_one);
        cmp = F128::CmpLe(F128::Abs(value), one);
    }
    f128 x = F128::Select(cmp, value, F128::Recp(value));

    // atan(x) = x - 1/3 * x^3 + ... + (-1)^n/(2n+1) * x^(2n+1)
    // = x(1 - xx(1/3 - xx(1/5 - xx(1/7 - xx(1/9 - xx(1/11 - xx(1/13 - xx(1/15 - xx(1/17)...)
    // NOTE:
    // DO NOT USE TAYLOR SERIES
    // minmax approximation(the output of Remez algorithm)
    f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
    f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
    f128 xx = F128::Mult(x, x);
    f128 result;
    // Horner evaluation from the highest-order Remez coefficient downward.
    result = F128::MultSub<3>(coeff1, xx, F128::SetValue<2>(coeff1, each_select32), each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));

    result = F128::Mult(result, x);
    result = F128::MultSub(xx, result, x);

    // Fold the reciprocal lanes back: +/-pi/2 - atan(1/value).
    f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
    f128 result_another = F128::Sub(F128::Mult(value_sign, pi_2), result);
    result = F128::Select(cmp, result, result_another);
    return result;
}
2276 
// r[i] = atan2(y[i], x[i]); special cases are resolved with a mask table
// before the generic atan(y/x) path runs.
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
    // y / x -> value
    // 0 / - -> pi | sign(y)
    // 0 / + -> 0 | sign(y)
    // y / Inf -> 0 | sign(y)
    // y / -Inf -> pi | sign(y)
    // y!=0 / 0 -> pi/2 | sign(y)
    // y!=0 / - -> atan(y/x) + pi | sign(y)
    // Inf / x -> pi/2 | sign(y)
    // +-Inf / Inf -> pi/4 | sign(y)
    // +-Inf / -Inf -> 3pi/4 | sign(y)
    // otherwise -> atan(y/x)

    // sx = sign(x), sy = sign(y)
    // infx = isinf(x), infy = isinf(y)
    // zerox = iszero(x), zeroy = iszero(y)
    // posx = x > 0
    const f128 signmask = F128::SetSignMask();
    // const f128 sx = F128::And(x, signmask);
    const f128 sy = F128::And(y, signmask);
    const f128 infx = F128::IsInfinite(x);
    const f128 infy = F128::IsInfinite(y);
    f128 zero = F128::SetZero();
    const f128 zerox = F128::CmpEq(x, zero);
    const f128 zeroy = F128::CmpEq(y, zero);
    const f128 posx = F128::CmpGt(x, zero);

    // v =
    // infy ?
    // infx ?
    // posx ? (pi/4 | sy) : (3pi/4 | sy)
    // : (pi/2 | sy)
    // : zeroy ?
    // posx ? (0 | sy) : (pi | sy)
    // : zerox ? (pi/2 | sy) : TrueMask;
    // Each constant is pre-ORed with sign(y) so the result carries y's sign.
    zero = F128::Or(zero, sy);
    const f128 cval = F128::LoadA16(atan2_cvalue_);
    const f128 pi = F128::Or(zero, F128::SetValue<0>(cval, each_select32));
    const f128 pi_34 = F128::Or(zero, F128::SetValue<1>(cval, each_select32));
    const f128 pi_2 = F128::Or(zero, F128::SetValue<2>(cval, each_select32));
    const f128 pi_4 = F128::Or(zero, F128::SetValue<3>(cval, each_select32));
#if defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
    // CAFE's Select cannot survive a 0xFFFFFFFF payload; use the pattern it
    // maps onto instead (see the mask recovery below).
    const f128 full = F128::SetValue(0xFF7FFFFFUL, each_uint32);
#else
    const f128 full = F128::CmpEq(zero, zero);
#endif

    // Lanes that match no special case come out as 'full' (all-ones sentinel).
    f128 v = F128::Select(
        infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
        F128::Select(zeroy, F128::Select(posx, zero, pi), F128::Select(zerox, pi_2, full)));

// mask = EqInt(v, full);
// result = Atan(y/x) + (posx ? (0 | sy) : (pi | sy))
// return mask ? result : v;
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 mask;
    mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
    mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
    mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
    mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(NLIB_CAFE_PPC)
    // select makes 0xFFFFFFFFUL -> 0xFF7FFFFFUL
    // (CAFE masks carry truth in the sign bit, hence the -1.f/1.f encoding)
    f128 mask;
    mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
    f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v), F128::CastToI128(full)));
#endif
    // Generic path: atan(y/x), shifted by pi (with y's sign) when x < 0.
    f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::Select(posx, zero, pi));
    return F128::Select(mask, result, v);
}
2350 
2351 NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
2352  // asin(x) = atan2 (x, sqrt ((1.0 + x) * (1.0 - x)))
2353  f128 one = F128::SetOne();
2354  f128 zero = F128::SetZero();
2355  f128 tmp = F128::MultSub(value, value, one);
2356  f128 argx = F128::Sqrt(F128::Select(F128::CmpLt(tmp, zero), zero, tmp));
2357  return F128::ArcTan2(value, argx);
2358 }
2359 
2360 NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
2361  // acos(x) = atan2 (sqrt ((1.0 + x) * (1.0 - x)), x)
2362  f128 one = F128::SetOne();
2363  f128 zero = F128::SetZero();
2364  f128 tmp = F128::MultSub(value, value, one);
2365  f128 argx = F128::Sqrt(F128::Select(F128::CmpLt(tmp, zero), zero, tmp));
2366  return F128::ArcTan2(argx, value);
2367 }
2368 
// see _mm_movemask_ps of SSE
NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    // Bit i is set only when lane i is the exact all-ones mask.
    uint8_t ret = 0;
    ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
    ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
    ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
    ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
    // AND each lane with its bit weight (1, 2, 4, 8); for all-ones mask lanes
    // this keeps the weight, for zero lanes it keeps nothing.
    NLIB_ALIGNAS(16) const uint64_t powers_[2] = {0x0000000200000001U, 0x0000000800000004U};
    uint64x2_t powers = vld1q_u64(powers_);
    uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(value),
                               vreinterpretq_u32_u64(powers));
    // Pairwise long-add folds lanes 0+1 and 2+3; byte 0 and byte 8 hold the
    // two partial sums, and their sum is the 4-bit mask.
    uint8x16_t mask = vreinterpretq_u8_u64(vpaddlq_u32(tmp));
    uint8_t result = vgetq_lane_u8(mask, 0) + vgetq_lane_u8(mask, 8);
    return result;
#elif defined(NLIB_CAFE_PPC)
    // CAFE masks carry truth in the sign bit; shift each sign into place.
    int tmp = (value.vec.u[0] >> 31);
    tmp |= (value.vec.u[1] >> 30) & 2;
    tmp |= (value.vec.u[2] >> 29) & 4;
    tmp |= (value.vec.u[3] >> 28) & 8;
    return tmp;
#endif
}
2396 
// true if value[i] == 0 for all i
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
    // PTEST sets ZF when (casted AND casted) is all zero.
    i128 casted = F128::CastToI128(value);
    return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    // Pairwise max folds four lanes into the low 64 bits; zero means all zero.
    uint32x4_t casted = vreinterpretq_u32_f32(value);
    casted = vpmaxq_u32(casted, casted);
    return vgetq_lane_u64(vreinterpretq_u64_u32(casted), 0) == 0;
# else
    // OR the low and high halves; the 64-bit result is zero only if every
    // lane was zero.
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
# endif
#elif defined(NLIB_CAFE_PPC)
    // CAFE masks carry truth in the sign bit, so only bit 31 is tested.
    uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
    return (tmp & 0x80000000U) == 0;
#endif
}
2419 
// true if value[i] == 0xFFFFFFFF for all i
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
           value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
    // PTEST carry flag: set when casted covers every bit of the all-ones vector.
    i128 casted = F128::CastToI128(value);
    return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    // Pairwise min folds four lanes into the low 64 bits; -1 means all ones.
    uint32x4_t casted = vreinterpretq_u32_f32(value);
    casted = vpminq_u32(casted, casted);
    return vgetq_lane_s64(vreinterpretq_s64_u32(casted), 0) == -1;
# else
    // AND the low and high halves; the 64-bit result is -1 only if every
    // lane was all ones.
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
# endif
#elif defined(NLIB_CAFE_PPC)
    // CAFE masks carry truth in the sign bit, so only bit 31 is tested.
    uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
    return (tmp & 0x80000000U) != 0;
#endif
}
2443 
template <size_t N>
// r = value[N]
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.v[N];
#elif defined(NLIB_SSE41)
    // _MM_EXTRACT_FLOAT writes the lane's bits into dest without conversion.
    float dest;
    _MM_EXTRACT_FLOAT(dest, value, N);
    return dest;
#elif defined(NLIB_NEON)
    return vgetq_lane_f32(value, N);
#elif defined(NLIB_CAFE_PPC)
    // Lanes are stored as two paired-singles: ps[0] = lanes 0/1, ps[1] = 2/3.
    return value.vec.ps[N / 2][N % 2];
#endif
}
2460 
2461 template <size_t N>
2462 // r = *reinterpret_cast<uint32_t*>(&value[N])
2463 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
2464  NLIB_STATIC_ASSERT(N < 4);
2465 #ifdef NLIB_F128_SIMD_NOUSE
2466  return value.vec.u[N];
2467 #elif defined(NLIB_SSE41)
2468  return _mm_extract_ps(value, N);
2469 #elif defined(NLIB_NEON)
2470  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2471  return vgetq_lane_u32(tmp, N);
2472 #elif defined(NLIB_CAFE_PPC)
2473  return value.vec.u[N];
2474 #endif
2475 }
2476 
// r = value[idx]
// Runtime-index variant of GetFloatFromLane: the intrinsics need a constant
// lane, so non-bit-accessible platforms dispatch through a switch.
NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT { // NOLINT
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    return value.vec.v[idx];
#elif defined(NLIB_SSE41)
    float dest;
    switch (idx) {
        case 0:
            _MM_EXTRACT_FLOAT(dest, value, 0);
            break;
        case 1:
            _MM_EXTRACT_FLOAT(dest, value, 1);
            break;
        case 2:
            _MM_EXTRACT_FLOAT(dest, value, 2);
            break;
        case 3:
            _MM_EXTRACT_FLOAT(dest, value, 3);
            break;
        default:
            // idx > 3 is a caller bug; tell the optimizer it cannot happen.
            NLIB_ASSUME(0);
            break;
    }
    return dest;
#elif defined(NLIB_NEON)
    switch (idx) {
        case 0:
            return vgetq_lane_f32(value, 0);
        case 1:
            return vgetq_lane_f32(value, 1);
        case 2:
            return vgetq_lane_f32(value, 2);
        case 3:
            return vgetq_lane_f32(value, 3);
        default:
            NLIB_ASSUME(0);
            break;
    }
#endif
}
2517 
// r = *reinterpret_cast<uint32_t*>(&value[idx])
// Runtime-index variant of GetUint32FromLane (intrinsics need constant lanes).
NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
    return value.vec.u[idx];
#elif defined(NLIB_SSE41)
    switch (idx) {
        case 0:
            return static_cast<uint32_t>(_mm_extract_ps(value, 0));
        case 1:
            return static_cast<uint32_t>(_mm_extract_ps(value, 1));
        case 2:
            return static_cast<uint32_t>(_mm_extract_ps(value, 2));
        case 3:
            return static_cast<uint32_t>(_mm_extract_ps(value, 3));
        default:
            // idx > 3 is a caller bug; tell the optimizer it cannot happen.
            NLIB_ASSUME(0);
            break;
    }
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vreinterpretq_u32_f32(value);
    switch (idx) {
        case 0:
            return vgetq_lane_u32(tmp, 0);
        case 1:
            return vgetq_lane_u32(tmp, 1);
        case 2:
            return vgetq_lane_u32(tmp, 2);
        case 3:
            return vgetq_lane_u32(tmp, 3);
        default:
            NLIB_ASSUME(0);
            break;
    }
#endif
}
2553 
template <size_t N>
// r = value, r[N] = v
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT { // NOLINT
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[N] = v;
    return ret;
#elif defined(NLIB_SSE41)
    // insert_ps control byte: bits 5:4 select the destination lane, source
    // lane 0 of tmp (where _mm_set_ss placed v).
    f128 tmp = _mm_set_ss(v);
    return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
    return vsetq_lane_f32(v, value, N);
#elif defined(NLIB_CAFE_PPC)
    // Lanes are stored as two paired-singles: ps[0] = lanes 0/1, ps[1] = 2/3.
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = v;
    return ret;
#endif
}
2573 
// r = value, r[i] = v
// Runtime-index variant of SetFloatToLane(); i must be in [0, 3].
NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[i] = v;
  return ret;
#elif defined(NLIB_SSE41)
  // insert_ps takes an immediate (source lane 0 -> dest lane i), so the
  // runtime index is dispatched through a switch.
  f128 tmp = _mm_set_ss(v);
  switch (i) {
  case 0:
    return _mm_insert_ps(value, tmp, 0x00);
  case 1:
    return _mm_insert_ps(value, tmp, 0x10);
  case 2:
    return _mm_insert_ps(value, tmp, 0x20);
  case 3:
    return _mm_insert_ps(value, tmp, 0x30);
  default:
    NLIB_ASSUME(0);  // i > 3 is a caller error
    break;
  }
#elif defined(NLIB_NEON)
  // vsetq_lane_f32() also needs an immediate lane number.
  switch (i) {
  case 0:
    return vsetq_lane_f32(v, value, 0);
  case 1:
    return vsetq_lane_f32(v, value, 1);
  case 2:
    return vsetq_lane_f32(v, value, 2);
  case 3:
    return vsetq_lane_f32(v, value, 3);
  default:
    NLIB_ASSUME(0);
    break;
  }
#elif defined(NLIB_CAFE_PPC)
  // Paired singles: lane i lives in half i/2, element i%2.
  f128 ret = value;
  switch (i) {
  case 0:
    ret.vec.ps[0][0] = v;
    break;
  case 1:
    ret.vec.ps[0][1] = v;
    break;
  case 2:
    ret.vec.ps[1][0] = v;
    break;
  default:
    ret.vec.ps[1][1] = v;
    break;
  }
  return ret;
#endif
}
2628 
2629 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
2630 namespace detail {
2631 
2632 template <bool IsHighA, bool IsHighB>
2633 float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;
2634 
2635 template <>
2636 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2637 #ifdef __aarch64__
2638  return vtrn1_f32(a, b);
2639 #else
2640  return vtrn_f32(a, b).val[0];
2641 #endif
2642 };
2643 
2644 template <>
2645 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2646 #ifdef __aarch64__
2647  return vtrn1_f32(vrev64_f32(a), b);
2648 #else
2649  return vtrn_f32(vrev64_f32(a), b).val[0];
2650 #endif
2651 };
2652 
2653 template <>
2654 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2655 #ifdef __aarch64__
2656  return vtrn1_f32(a, vrev64_f32(b));
2657 #else
2658  return vtrn_f32(a, vrev64_f32(b)).val[0];
2659 #endif
2660 };
2661 
2662 template <>
2663 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2664 #ifdef __aarch64__
2665  return vtrn2_f32(a, b);
2666 #else
2667  return vtrn_f32(a, b).val[1];
2668 #endif
2669 };
2670 
2671 template <size_t Z>
2672 float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;
2673 
2674 template <>
2675 NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
2676  return vget_low_f32(value);
2677 }
2678 
2679 template <>
2680 NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
2681  return vget_high_f32(value);
2682 }
2683 
2684 template <size_t X0, size_t X1>
2685 struct F128SwizzleHelper2 {
2686  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2687  float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
2688  float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
2689  return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
2690  }
2691 };
2692 
2693 template <size_t X>
2694 struct F128SwizzleHelper2<X, X> {
2695  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2696  float32x2_t x = F128SwizzleGet64<X / 2>(value);
2697  return vdup_lane_f32(x, (X & 1));
2698  }
2699 };
2700 
2701 template <>
2702 struct F128SwizzleHelper2<0, 1> {
2703  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2704  return vget_low_f32(value);
2705  }
2706 };
2707 
2708 template <>
2709 struct F128SwizzleHelper2<0, 2> {
2710  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2711 #ifdef __aarch64__
2712  return vget_low_f32(vuzp1q_f32(value, value));
2713 #else
2714  float32x2_t lo = vget_low_f32(value);
2715  float32x2_t hi = vget_high_f32(value);
2716  return vzip_f32(lo, hi).val[0];
2717 #endif
2718  }
2719 };
2720 
2721 template <>
2722 struct F128SwizzleHelper2<0, 3> {
2723  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2724  float32x2_t lo = vget_low_f32(value);
2725  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2726 #ifdef __aarch64__
2727  return vzip1_f32(lo, hi);
2728 #else
2729  return vzip_f32(lo, hi).val[0];
2730 #endif
2731  }
2732 };
2733 
2734 template <>
2735 struct F128SwizzleHelper2<1, 0> {
2736  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2737  return vrev64_f32(vget_low_f32(value));
2738  }
2739 };
2740 
2741 template <>
2742 struct F128SwizzleHelper2<1, 2> {
2743  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2744  float32x2_t lo = vget_low_f32(value);
2745  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2746 #ifdef __aarch64__
2747  return vzip2_f32(lo, hi);
2748 #else
2749  return vzip_f32(lo, hi).val[1];
2750 #endif
2751  }
2752 };
2753 
2754 template <>
2755 struct F128SwizzleHelper2<1, 3> {
2756  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2757 #ifdef __aarch64__
2758  return vget_low_f32(vuzp2q_f32(value, value));
2759 #else
2760  float32x2_t lo = vget_low_f32(value);
2761  float32x2_t hi = vget_high_f32(value);
2762  return vzip_f32(lo, hi).val[1];
2763 #endif
2764  }
2765 };
2766 
2767 template <>
2768 struct F128SwizzleHelper2<2, 0> {
2769  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2770 #ifdef __aarch64__
2771  return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
2772 #else
2773  float32x2_t lo = vget_low_f32(value);
2774  float32x2_t hi = vget_high_f32(value);
2775  return vzip_f32(hi, lo).val[0];
2776 #endif
2777  }
2778 };
2779 
2780 template <>
2781 struct F128SwizzleHelper2<2, 1> {
2782  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2783 #ifdef __aarch64__
2784  return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
2785 #else
2786  float32x2_t lo = vget_low_f32(value);
2787  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2788  return vzip_f32(hi, lo).val[1];
2789 #endif
2790  }
2791 };
2792 
2793 template <>
2794 struct F128SwizzleHelper2<2, 3> {
2795  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2796  return vget_high_f32(value);
2797  }
2798 };
2799 
2800 template <>
2801 struct F128SwizzleHelper2<3, 0> {
2802  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2803  float32x2_t lo = vget_low_f32(value);
2804  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2805 #ifdef __aarch64__
2806  return vzip1_f32(hi, lo);
2807 #else
2808  return vzip_f32(hi, lo).val[0];
2809 #endif
2810  }
2811 };
2812 
2813 template <>
2814 struct F128SwizzleHelper2<3, 1> {
2815  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2816  float32x2_t lo = vget_low_f32(value);
2817  float32x2_t hi = vget_high_f32(value);
2818 #ifdef __aarch64__
2819  return vzip2_f32(hi, lo);
2820 #else
2821  return vzip_f32(hi, lo).val[1];
2822 #endif
2823  }
2824 };
2825 
2826 template <>
2827 struct F128SwizzleHelper2<3, 2> {
2828  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2829  return vrev64_f32(vget_high_f32(value));
2830  }
2831 };
2832 
2833 template <size_t V0, size_t V1, size_t V2, size_t V3>
2834 struct F128SwizzleHelper {
2835  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2836  return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
2837  detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
2838  }
2839 };
2840 
2841 template <size_t Vx, size_t Vy>
2842 struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
2843  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2844  float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
2845  return vcombine_f32(tmp, tmp);
2846  }
2847 };
2848 
2849 template <size_t V>
2850 struct F128SwizzleHelper<V, V, V, V> {
2851  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2852  return F128::SetValue<V>(value, each_select32);
2853  }
2854 };
2855 
2856 } // namespace detail
2857 #elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Helpers for F128::Swizzle() on Cafe paired singles.
// A 128-bit vector is two f32x2 halves; indices X0, X1 address the four
// elements of {v0[0], v0[1], v1[0], v1[1]}.
// __PS_MERGEab(x, y) produces { x[a], y[b] }.

template <size_t X0, size_t X1>
struct F128SwizzleHelper {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT;
};

// r = { e0, e0 }
template<>
struct F128SwizzleHelper<0, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE00(v0, v0);
  }
};

// r = { e0, e1 } — identity of the low half.
template<>
struct F128SwizzleHelper<0, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return v0;
  }
};

// r = { e0, e2 }
template<>
struct F128SwizzleHelper<0, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE00(v0, v1);
  }
};

// r = { e0, e3 }
template<>
struct F128SwizzleHelper<0, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE01(v0, v1);
  }
};

// r = { e1, e0 }
template<>
struct F128SwizzleHelper<1, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE10(v0, v0);
  }
};

// r = { e1, e1 }
template<>
struct F128SwizzleHelper<1, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE11(v0, v0);
  }
};

// r = { e1, e2 }
template<>
struct F128SwizzleHelper<1, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE10(v0, v1);
  }
};

// r = { e1, e3 }
template<>
struct F128SwizzleHelper<1, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE11(v0, v1);
  }
};

// r = { e2, e0 }
template<>
struct F128SwizzleHelper<2, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE00(v1, v0);
  }
};

// r = { e2, e1 }
template<>
struct F128SwizzleHelper<2, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE01(v1, v0);
  }
};

// r = { e2, e2 }
template<>
struct F128SwizzleHelper<2, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE00(v1, v1);
  }
};

// r = { e2, e3 } — identity of the high half.
template<>
struct F128SwizzleHelper<2, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return v1;
  }
};

// r = { e3, e0 }
template<>
struct F128SwizzleHelper<3, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE10(v1, v0);
  }
};

// r = { e3, e1 }
template<>
struct F128SwizzleHelper<3, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE11(v1, v0);
  }
};

// r = { e3, e2 }
template<>
struct F128SwizzleHelper<3, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE10(v1, v1);
  }
};

// r = { e3, e3 }
template<>
struct F128SwizzleHelper<3, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE11(v1, v1);
  }
};

}  // namespace detail
2986 #endif
2987 
2988 template <size_t V0, size_t V1, size_t V2, size_t V3>
2989 // r[0] = value[V0], r[1] = value[V1], r[2] = value[V2], r[3] = value[V3]
2990 NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
2991  NLIB_STATIC_ASSERT(V0 < 4);
2992  NLIB_STATIC_ASSERT(V1 < 4);
2993  NLIB_STATIC_ASSERT(V2 < 4);
2994  NLIB_STATIC_ASSERT(V3 < 4);
2995 #if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
2996  f128 ret;
2997  ret.vec.v[0] = value.vec.v[V0];
2998  ret.vec.v[1] = value.vec.v[V1];
2999  ret.vec.v[2] = value.vec.v[V2];
3000  ret.vec.v[3] = value.vec.v[V3];
3001  return ret;
3002 #elif defined(NLIB_SSE41)
3003  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(V3, V2, V1, V0));
3004 #elif defined(NLIB_NEON)
3005  return detail::F128SwizzleHelper<V0, V1, V2, V3>::Swizzle(value);
3006 #elif defined(NLIB_CAFE_PPC)
3007  f128 ret;
3008  ret.vec.ps[0] = detail::F128::SwizzleHelper<V0, V1>(value.vec.ps[0], value.vec.ps[1]);
3009  ret.vec.ps[1] = detail::F128::SwizzleHelper<V2, V3>(value.vec.ps[0], value.vec.ps[1]);
3010  return ret;
3011 #endif
3012 }
3013 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Swizzle specialization for NEON
// Lane patterns that map to a single zip/trn/uzp/rev/ext instruction.
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip1q_f32(value, value);
#else
  return vzipq_f32(value, value).val[0];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[0];
#endif
}
// Identity swizzle: nothing to do.
template <>
NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
  return value;
}
template <>
NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp1q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[0];
#endif
}
// Swap within each 64-bit pair.
template <>
NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
  return vrev64q_f32(value);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn2q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[1];
#endif
}
// Rotate left by one lane (vext on the value concatenated with itself).
template <>
NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 1);
  return vreinterpretq_f32_u32(rotated);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp2q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[1];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip2q_f32(value, value);
#else
  return vzipq_f32(value, value).val[1];
#endif
}
// Rotate left by two lanes (swap the 64-bit halves).
template <>
NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 2);
  return vreinterpretq_f32_u32(rotated);
}
// Rotate left by three lanes (== rotate right by one).
template <>
NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 3);
  return vreinterpretq_f32_u32(rotated);
}
#endif
3091 
3092 namespace detail {
3093 
3094 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Two-vector permute on SSE4.1: result lane i takes lane Vi from the
// concatenation {a[0..3], b[0..3]} (Vi in [0, 7]).
// Generic fallback: swizzle both operands into the requested order, then
// blend lane-wise (mask bit i selects bs for result lane i when Vi >= 4).
template <bool UseBlend, bool UseShuffle, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2 {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
    f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
    return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                                 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

// UseBlend: every Vi % 4 == i, so no swizzle is needed — one blend suffices.
template <bool UseShuffle, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                               ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

// UseShuffle: the low two lanes come from one operand and the high two from
// the other, so a single shuffle_ps does it.
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
                          _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
  }
};

// r = { a1, a2, a3, b0 }: byte-align the a:b concatenation right by 4 bytes.
template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
    return _mm_castsi128_ps(tmp);
  }
};

// r = { a3, b0, b1, b2 }: shift the b:a concatenation right by 12 bytes.
template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
    return _mm_castsi128_ps(tmp);
  }
};
3136 
3137 template <>
3138 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3139  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3140  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
3141  return _mm_castsi128_ps(tmp);
3142  }
3143 };
3144 
// Exactly one lane differs from the identity: a single _mm_insert_ps copies
// that lane across.  The immediate encodes the source lane in bits 7:6 and
// the destination lane in bits 5:4.

// r = a, r[0] = b[V - 4]
template <size_t V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
  }
};

// r = a, r[1] = b[V - 4]
template <size_t V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
  }
};

// r = a, r[2] = b[V - 4]
template <size_t V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
  }
};

// r = a, r[3] = b[V - 4]
template <size_t V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
  }
};

// r = b, r[0] = a[V]
template <size_t V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
  }
};

// r = b, r[1] = a[V]
template <size_t V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
  }
};

// r = b, r[2] = a[V]
template <size_t V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
  }
};

// r = b, r[3] = a[V]
template <size_t V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
  }
};
3208 
// Dispatcher: decide at compile time which single-instruction strategy of
// F128PermuteHelper2 applies — blend (each Vi % 4 == i) or shuffle (low half
// from one operand, high half from the other) — else the generic path.
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128PermuteHelper2<
        ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
        ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
        V0, V1, V2, V3>::Permute(a, b);
  }
};
3218 
3219 #elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3220 
// NEON permute: lane indices address the concatenation {a[0..3], b[0..3]}.
// F128PermuteGet64<Z> returns the Z-th 64-bit half of that concatenation.
template <size_t Z>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(b);
  return vget_low_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(b);
  return vget_high_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(a);
  return vget_low_f32(b);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(a);
  return vget_high_f32(b);
}

// 2-lane permute: fetch the half holding each source lane, then merge by
// the in-half positions (same scheme as F128SwizzleHelper2).
template <size_t X0, size_t X1>
struct F128PermuteHelper2 {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
    float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

// Same lane twice: duplicate it.
template <size_t X>
struct F128PermuteHelper2<X, X> {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x = F128PermuteGet64<X / 2>(a, b);
    return vdup_lane_f32(x, (X & 1));
  }
};

// 4-lane permute = two 2-lane permutes combined.
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
                        F128PermuteHelper2<V2, V3>::Permute(a, b));
  }
};

// r = { a1, a2, a3, b0 } — a single vext.
template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};

// r = { a3, b0, b1, b2 }
template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
    return vreinterpretq_f32_s32(tmp);
  }
};

// r = { b1, b2, b3, a0 }
template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};
3293 #elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
// Cafe paired-single permute.  The four input halves are
// v0 = a.lo, v1 = a.hi, v2 = b.lo, v3 = b.hi; VAR0/VAR1 select which halves
// feed the 2-element swizzle, and R0/R1 are the in-half element positions.
// Each specialization forwards to F128SwizzleHelper<R0, 2 + R1>, which picks
// element R0 of its first argument and element R1 of its second.
template<size_t R0, size_t R1, size_t VAR0, size_t VAR1>
struct F128PermuteHelper2 {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
  }
};

template<size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
  }
};

// Build each output half from (half index Vi/2, element index Vi&1).
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret;
    f32x2 x0 = a.vec.ps[0];
    f32x2 x1 = a.vec.ps[1];
    f32x2 x2 = b.vec.ps[0];
    f32x2 x3 = b.vec.ps[1];
    ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
        ::Permute(x0, x1, x2, x3);
    ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
        ::Permute(x0, x1, x2, x3);
    return ret;
  }
};
3426 #else
// Scalar fallback: extract each requested lane from a or b and rebuild.
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                              F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                              F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                              F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
    return ret;
  }
};
3437 #endif
3438 
// All lanes come from a: the permute degenerates to a swizzle of a.
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return F128::Swizzle<V0, V1, V2, V3>(a);
  }
};
3446 
// All lanes come from b: rebase the indices to [0, 3] and swizzle b.
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(a);
    return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
  }
};
3454 
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Permute specialization for SSE4.1
// Interleave patterns that map directly to unpacklo/unpackhi.
// r = { a0, b0, a1, b1 }
template <>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(a, b);
  }
};
// r = { b0, a0, b1, a1 }
template <>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(b, a);
  }
};
// r = { a2, b2, a3, b3 }
template <>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(a, b);
  }
};
// r = { b2, a2, b3, a3 }
template <>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(b, a);
  }
};
#endif
3482 
// Permute with "don't care" lanes: an index of 8 means the caller does not
// care what ends up in that lane.  The primary template handles the case with
// no don't-care lanes and forwards to F128PermuteHelper, flagging the
// all-from-a / all-from-b cases so they collapse to plain swizzles.
template<size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteDontCareHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
    static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
    return detail::F128PermuteHelper< arg1, arg2,
                                      V0, V1, V2, V3 >::Permute(a, b);
  }
};
3496 
// The specializations below replace each don't-care index (8) with a concrete
// lane chosen so that the resolved pattern tends to hit a cheap permute:
// the filled-in lane is the 64-bit "partner" of its neighbor (the even/odd
// mate within the same half), or the matching half when both of a pair are
// don't-care.

// V0 unknown: pair it with V1's partner lane.
template<size_t V1, size_t V2, size_t V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const size_t V0 = (V1 & 1) ? V1 - 1 : V1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

// V1 unknown: pair it with V0's partner lane.
template<size_t V0, size_t V2, size_t V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const size_t V1 = (V0 & 1) ? V0 : (V0 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

// V2 unknown: pair it with V3's partner lane.
template<size_t V0, size_t V1, size_t V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

// V3 unknown: pair it with V2's partner lane.
template<size_t V0, size_t V1, size_t V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    static const size_t V3 = (V2 & 1) ? V2 : (V2 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

// Low half unknown: take the identity pair from whichever operand V2 uses.
template<size_t V2, size_t V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const size_t V0 = (V2 < 4) ? 0 : 4;
    return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
  }
};

// Outer lanes unknown: partner V0 with V1, and V3 with V2.
template<size_t V1, size_t V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    static const size_t V0 = (V1 & 1) ? V1 - 1: V1;
    static const size_t V3 = (V2 & 1) ? V2 : V2 + 1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

// High half unknown: take lanes 2,3 of whichever operand V1 uses.
template<size_t V0, size_t V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    static const size_t V2 = (V1 < 4) ? 2 : 6;
    return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
  }
};

// Inner lanes unknown: partner V1 with V0, and V2 with V3.
template<size_t V0, size_t V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const size_t V1 = (V0 & 1) ? V0 : V0 + 1;
    static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
3582 
3583 template<size_t V0, size_t V2>
3584 struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
3585  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3586  NLIB_STATIC_ASSERT(V0 < 8);
3587  NLIB_STATIC_ASSERT(V2 < 8);
3588  static const size_t V1 = (V0 & 1) ? V0 : V0 + 1;
3589  static const size_t V3 = (V2 & 1) ? V2 : V2 + 1;
3590  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3591  }
3592 };
3593 
3594 template<size_t V1, size_t V3>
3595 struct F128PermuteDontCareHelper<8, V1, 8, V3> {
3596  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3597  NLIB_STATIC_ASSERT(V1 < 8);
3598  NLIB_STATIC_ASSERT(V3 < 8);
3599  static const size_t V0 = (V1 & 1) ? V1 - 1 : V1;
3600  static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
3601  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3602  }
3603 };
3604 
3605 template<size_t V>
3606 struct F128PermuteDontCareHelper<V, 8, 8, 8> {
3607  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3608  NLIB_STATIC_ASSERT(V < 8);
3609  static const size_t V1 = ((V & 3) == 0) ? V + 1 : V;
3610  static const size_t V2 = ((V & 3) == 0) ? V + 2 : V;
3611  static const size_t V3 = ((V & 3) == 0) ? V + 3 : V;
3612  return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
3613  }
3614 };
3615 
3616 template<size_t V>
3617 struct F128PermuteDontCareHelper<8, V, 8, 8> {
3618  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3619  NLIB_STATIC_ASSERT(V < 8);
3620  static const size_t V0 = ((V & 3) == 1) ? V - 1 : V;
3621  static const size_t V2 = ((V & 3) == 1) ? V + 1 : V;
3622  static const size_t V3 = ((V & 3) == 1) ? V + 2 : V;
3623  return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
3624  }
3625 };
3626 
3627 template<size_t V>
3628 struct F128PermuteDontCareHelper<8, 8, V, 8> {
3629  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3630  NLIB_STATIC_ASSERT(V < 8);
3631  static const size_t V0 = ((V & 3) == 2) ? V - 2 : V;
3632  static const size_t V1 = ((V & 3) == 2) ? V - 1 : V;
3633  static const size_t V3 = ((V & 3) == 2) ? V + 2 : V;
3634  return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
3635  }
3636 };
3637 
3638 template<size_t V>
3639 struct F128PermuteDontCareHelper<8, 8, 8, V> {
3640  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3641  NLIB_STATIC_ASSERT(V < 8);
3642  static const size_t V0 = ((V & 3) == 3) ? V - 3 : V;
3643  static const size_t V1 = ((V & 3) == 3) ? V - 2 : V;
3644  static const size_t V2 = ((V & 3) == 3) ? V - 1 : V;
3645  return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
3646  }
3647 };
3648 
template <>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
  // All four lanes are don't-care: any result is acceptable, so return a
  // unchanged and emit no shuffle at all.
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return a;
  }
};
3656 
3657 } // namespace detail
3658 
template <size_t V0, size_t V1, size_t V2, size_t V3>
// r[0] = V0 < 4 ? a[V0] : b[V0 - 4], .....
// A selector value of 8 means "don't care"; the helper then picks whatever
// concrete lane yields the cheapest shuffle.
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return detail::F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
}
3664 
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
// r[i] = SplatLane_i ? splat[i] : value[i]
// note that splat[0] = splat[1] = splat[2] = splat[3], so any selector in
// 4..7 may stand in for the splat operand.
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  // Choose selectors pairwise (4/5 swapped, 6/7 swapped as needed) --
  // presumably so the Permute maps onto cheap NEON pairwise shuffles;
  // legal because every lane of splat is identical.
  const size_t v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
  const size_t v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
  const size_t v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
  const size_t v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
  // SSE4.1 has _mm_blend_ps()
  const size_t v0 = SplatLane0 ? 4 : 0;
  const size_t v1 = SplatLane1 ? 5 : 1;
  const size_t v2 = SplatLane2 ? 6 : 2;
  const size_t v3 = SplatLane3 ? 7 : 3;
#endif
  return F128::Permute<v0, v1, v2, v3>(value, splat);
}
3683 
// Computes 2^value per lane: split the argument into integer and fractional
// parts, approximate 2^frac with a rational polynomial, then scale by 2^int
// through the exponent bits. Overflow is not checked.
NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  // Scalar fallback: one libm call per lane.
  f128 ret;
  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
  return ret;
#else
  // x = value - round(value), the fractional remainder in [-0.5, 0.5].
  i128 iround = F128::ConvertToI128Round(value);
  f128 fround = F128::ConvertFromI128(iround);
  f128 x = F128::Sub(value, fround);
  f128 xx = F128::Mult(x, x);

  // P/Q coefficient tables for the rational approximation of 2^x.
  f128 P = F128::LoadA16(F128::exp2_P_);
  f128 Q = F128::LoadA16(F128::exp2_Q_);

  f128 px;
  // px = P[0]
  // px = px * xx + P[1]
  // px = px * xx + P[2]
  // px = x * px;
  px = F128::MultAdd<0>(P, xx, F128::SetValue<1>(P, each_select32), each_select32);
  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
  px = F128::Mult(x, px);

  f128 qx;
  // qx = xx + Q[0]
  // qx = qx * xx + Q[1]
  qx = F128::Add(xx, F128::SetValue<0>(Q, each_select32));
  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));

  // Combine numerator and denominator: x = Q[3] * (px / (qx - px)) + Q[2].
  x = F128::Div(px, F128::Sub(qx, px));
  x = F128::MultAdd<3>(Q, x, F128::SetValue<2>(Q, each_select32), each_select32);

  // x = x * 2^iround
  // Build 2^iround directly in the float exponent field (bias 127, shift 23).
  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
  iround = I128::ShiftLeftLogical32(iround, 23);
  x = F128::Mult(x, F128::CastFromI128(iround));

  // NOTE:
  // overflow not checked
  return x;
#endif
}
3729 
// Computes e^value per lane via the identity e^x = 2^(x * log2(e)).
NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
  static const float log2e = 1.44269504088896340736f;  // log2(e)
  return Exp2(F128::Mult(log2e, value));
}
3734 
3735 NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
3736  static const float log2e = 1.44269504088896340736f;
3737  f128 negOne = F128::SetValue(-1.f, each_float);
3738  f128 v0 = F128::MultAdd(log2e, value, negOne);
3739  f128 v1 = F128::MultSub(log2e, value, negOne);
3740  f128 e0 = Exp2(v0);
3741  f128 e1 = Exp2(v1);
3742  return F128::Sub(e0, e1);
3743 }
3744 
3745 NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
3746  static const float log2e = 1.44269504088896340736f;
3747  f128 negOne = F128::SetValue(-1.f, each_float);
3748  f128 v0 = F128::MultAdd(log2e, value, negOne);
3749  f128 v1 = F128::MultSub(log2e, value, negOne);
3750  f128 e0 = Exp2(v0);
3751  f128 e1 = Exp2(v1);
3752  return F128::Add(e0, e1);
3753 }
3754 
// Computes tanh per lane using tanh(x) = 1 - 2 / (1 + e^(2x)).
// (The previous comment wrote '*' where '/' belongs.)
NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
  // cvalue lanes: presumably [0] = 2*log2(e), [1] = 1, [2] = 0.5 -- defined
  // with tanh_cvalue_ elsewhere; TODO confirm against its initializer.
  f128 cvalue = F128::LoadA16(tanh_cvalue_);
  f128 e = F128::Mult<0>(cvalue, value, each_select32);
  e = F128::Exp2(e);                                    // e = e^(2x)
  f128 half = F128::SetValue<2>(cvalue, each_select32);
  e = F128::MultAdd(half, e, half);                     // e = (e^(2x) + 1) / 2
  e = F128::Recp(e);                                    // e = 2 / (e^(2x) + 1)
  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
}
3765 
// Computes tan(value) per lane using Cody-Waite style argument reduction to
// [-pi/4, pi/4] and a rational approximation (coefficients in tan_p_/tan_q_).
NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  // Scalar fallback: one libm call per lane.
  f128 ret;
  ret.vec.v[0] = tanf(value.vec.v[0]);
  ret.vec.v[1] = tanf(value.vec.v[1]);
  ret.vec.v[2] = tanf(value.vec.v[2]);
  ret.vec.v[3] = tanf(value.vec.v[3]);
  return ret;
#else
  // Cody and Waite algorithm
  // C lanes: presumably [0] = 2/pi, [1]/[2] = a hi/lo split of pi/2 for the
  // two-step reduction, [3] = near-zero tolerance -- defined with tan_c_
  // elsewhere; TODO confirm against its initializer.
  f128 C = F128::LoadA16(&F128::tan_c_[0]);

  // g = round(value / (pi/2))
  f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
  f128 nearXAxis;
  {
    // nearXAxis marks lanes where g is even (reduced angle measured from the
    // x-axis); odd multiples of pi/2 take the cotangent branch below.
    i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
    i128 cmp = I128::CmpEq32(t0, I128::SetZero());
    nearXAxis = F128::CastFromI128(cmp);
  }

  // f = value - (pi/2) * g
  // Two-step subtraction keeps extra precision in the reduced argument.
  f128 f = F128::MultSub<1>(C, g, value, each_select32);
  f = F128::MultSub<2>(C, g, f, each_select32);

  f128 zero = F128::SetZero();
  // Lanes whose reduced argument is nearly zero bypass the rational form
  // (p/q degenerates to f/1 there).
  f128 nearAxis = F128::CmpNearEq(f, zero, F128::SetValue<3>(C, each_select32));

  f128 P = F128::LoadA16(&F128::tan_p_[0]);
  f128 Q = F128::LoadA16(&F128::tan_q_[0]);

  f128 ff = F128::Mult(f, f);
  f128 one = F128::SetValue<3>(P, each_select32);

  // p = f * (((P2*ff + P1)*ff + P0)*ff + 1)  (Horner in ff)
  f128 p = F128::MultAdd<2>(P, ff, F128::SetValue<1>(P, each_select32), each_select32);
  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
  p = F128::MultAdd(p, ff, one);
  p = F128::Mult(f, p);

  // q = (((Q3*ff + Q2)*ff + Q1)*ff + Q0)*ff + 1  (Horner in ff)
  f128 q = F128::MultAdd<3>(Q, ff, F128::SetValue<2>(Q, each_select32), each_select32);
  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
  q = F128::MultAdd(q, ff, one);

  p = F128::Select(nearAxis, f, p);
  q = F128::Select(nearAxis, one, q);

  f128 r0 = F128::Div(p, q);               // tan of the reduced argument
  f128 r1 = F128::Negate(F128::Recp(r0));  // -cot for odd multiples of pi/2

  return F128::Select(nearXAxis, r0, r1);
#endif
}
3819 
// Computes log2(value) per lane: split into mantissa and exponent, evaluate a
// rational approximation of the natural log of the mantissa, convert to base
// 2, and patch up NaN/inf/zero/negative inputs at the end.
NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  static const float scale = 1.4426950408889634f;  // 1 / LogE(2.0)
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return F128::Mult(scale, ret);
#else
  // x = frexp(value, &e)
  // Keep sign+mantissa bits and force the biased exponent to 127, placing x
  // in [1, 2) for normal inputs.
  f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
  // e = unbiased binary exponent of value.
  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
  e = I128::ShiftRightLogical32(e, 23);
  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));

  x = F128::Sub(x, F128::SetOne());  // x <- x - 1, now in [0, 1)
  f128 z = F128::Mult(x, x);
  f128 y;

  // P/Q coefficients packed four-per-register in log2_PQ_.
  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);

  // p: Horner evaluation of the numerator polynomial in x.
  f128 p = F128::SetValue<0>(pq0, each_select32);
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));

  // q: Horner evaluation of the monic denominator polynomial in x.
  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));

  y = F128::Mult(z, p);
  y = F128::Div(y, q);
  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));  // y = x*y - z/2

  f128 result;
  {
    // do not optimize
    // The summation order below converts the natural-log pieces to base 2
    // and folds in the exponent; reordering would lose precision.
    // log2ea is presumably log2(e) - 1 (Cephes' LOG2EA constant) -- TODO
    // confirm against the log2_PQ_ table definition.
    f128 log2ea = F128::SetValue<3>(pq2, each_select32);
    result = F128::Mult(y, log2ea);
    result = F128::MultAdd(log2ea, x, result);
    result = F128::Add(result, y);
    result = F128::Add(result, x);
    result = F128::Add(result, F128::ConvertFromI128(e));
  }

  {
    // Special-case fixups; later Selects take precedence over earlier ones.
    f128 zero = F128::SetZero();
    f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));

    // value is NaN -> NaN
    f128 is_nan = F128::IsNaN(value);
    f128 nan = F128::SetValue<0>(nan_inf, each_select32);
    result = F128::Select(is_nan, nan, result);

    f128 is_inf = F128::IsInfinite(value);
    f128 is_pos = F128::CmpGt(value, zero);

    // value == inf -> +inf
    f128 inf = F128::SetValue<1>(nan_inf, each_select32);
    f128 is_pos_inf = F128::And(is_inf, is_pos);
    result = F128::Select(is_pos_inf, inf, result);

    // value == 0 -> -inf
    f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
    f128 is_zero = F128::CmpEq(value, zero);
    result = F128::Select(is_zero, neg_inf, result);

    // value < 0 -> -NaN
    f128 neg_nan = F128::SetValue<2>(nan_inf, each_select32);
    f128 is_neg = F128::CmpLt(value, zero);
    result = F128::Select(is_neg, neg_nan, result);

    // otherwise -> Log2(value)
  }

  return result;
#endif
}
3906 
// Computes the natural logarithm per lane: ln(x) = log2(x) * ln(2).
NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Scalar fallback: one libm call per lane.
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return ret;
#else
  f128 x = F128::Log2(value);
  // ln(2); the constant differs from 0.69314718055994531 only beyond float
  // precision, so both round to the same float.
  static const float recp_log2e = 0.6931471805597018f;
  return F128::Mult(recp_log2e, x);
#endif
}
3921 
3922 #undef NLIB_M
3923 #undef NLIB_M2
3924 #endif // NLIB_DOXYGEN
3925 
3926 typedef f128 SimdVector;
3927 typedef f128arg SimdVectorArg;
3928 typedef f128 SimdQuaternion;
3929 typedef f128arg SimdQuaternionArg;
3930 typedef f128 SimdPlane;
3931 typedef f128arg SimdPlaneArg;
3932 typedef f128 SimdSphere;
3933 typedef f128arg SimdSphereArg;
3934 
3935 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
3936 struct NLIB_ALIGNAS(16) SimdMatrix {
3937 #else
3938 struct SimdMatrix {
3939 #endif
3940  public:
3942  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
3943  r[0] = r0;
3944  r[1] = r1;
3945  r[2] = r2;
3946  r[3] = r3;
3947  }
3948  SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
3949  float m13, float m20, float m21, float m22, float m23, float m30, float m31,
3950  float m32, float m33) NLIB_NOEXCEPT;
3951  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
3952 
3953  public:
3954  f128 r[4];
3955 };
3956 
// Sets up the matrix from 16 scalars; argument mRC is stored in row R,
// column C (row-major order).
inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
                              float m12, float m13, float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
  r[0] = F128::SetValue(m00, m01, m02, m03);
  r[1] = F128::SetValue(m10, m11, m12, m13);
  r[2] = F128::SetValue(m20, m21, m22, m23);
  r[3] = F128::SetValue(m30, m31, m32, m33);
}
3965 
3966 inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
3967  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
3968  NLIB_ASSERT((algn & 3) == 0);
3969  switch (algn >> 2) {
3970  case 0:
3971  r[0] = F128::LoadA16(p);
3972  r[1] = F128::LoadA16(p + 4);
3973  r[2] = F128::LoadA16(p + 8);
3974  r[3] = F128::LoadA16(p + 12);
3975  break;
3976  case 1:
3977  r[0] = F128::LoadA4(p);
3978  r[1] = F128::LoadA4(p + 4);
3979  r[2] = F128::LoadA4(p + 8);
3980  r[3] = F128::LoadA4(p + 12);
3981  break;
3982  case 2:
3983  r[0] = F128::LoadA8(p);
3984  r[1] = F128::LoadA8(p + 4);
3985  r[2] = F128::LoadA8(p + 8);
3986  r[3] = F128::LoadA8(p + 12);
3987  break;
3988  default:
3989  NLIB_ASSUME(0);
3990  break;
3991  }
3992 }
3993 
3994 #if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
3995 typedef const SimdMatrix& SimdMatrixArg;
3996 #else
3997 typedef const SimdMatrix SimdMatrixArg;
3998 #endif
3999 
// NLIB_F128_TRANSPOSE(row0, row1, row2, row3):
// transposes, in place, the 4x4 matrix whose rows are the four f128 lvalues.
#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
// Generic/SSE path: interleave 64-bit halves, then pick alternating lanes.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
        f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
        f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
        f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
        row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
        row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
        row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
        row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
    }
#elif defined(NLIB_NEON)
// NEON path: vtrnq_f32 transposes 2x2 sub-blocks, then the half-vectors are
// recombined into the transposed rows.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
        float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
        row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
        row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
        row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
    }
#elif defined(NLIB_CAFE_PPC)
// Cafe paired-single path: transpose via 2-wide merge operations on the ps
// halves, swapping the off-diagonal 2x2 blocks in the middle step.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f32x2 tmp0, tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
        row0.vec.ps[0] = tmp0; \
        row1.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
        row2.vec.ps[1] = tmp0; \
        row3.vec.ps[1] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = row2.vec.ps[0]; \
        row1.vec.ps[1] = row3.vec.ps[0]; \
        row2.vec.ps[0] = tmp0; \
        row3.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = tmp0; \
        row1.vec.ps[1] = tmp1; \
    }
#endif
4046 
4048  float x;
4049  float y;
4050  float z;
4051 };
4052 
4054  float x;
4055  float y;
4056  float z;
4057  float w;
4058 };
4059 
// The type for reading and writing 2x3 matrices in memory.
struct NLIB_VIS_PUBLIC Float2x3 {
  float m[2][3];  // The data member m is a 2x3 matrix.
};
4063 
4065  float m[3][3];
4066 };
4067 
// The type for reading and writing 3x4 matrices in memory; 16-byte aligned
// where the platform supports it.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float3x4 {
#else
struct Float3x4 {
#endif
  float m[3][4];  // The data member m is a 3x4 matrix.
};
4075 
// The type for reading and writing 4x3 matrices in memory; 16-byte aligned
// where the platform supports it.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x3 {
#else
struct Float4x3 {
#endif
  float m[4][3];  // The data member m is a 4x3 matrix.
};
4083 
// The type for reading and writing 4x4 matrices in memory; 16-byte aligned
// where the platform supports it.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x4 {
#else
struct Float4x4 {
#endif
  float m[4][4];  // The data member m is a 4x4 matrix.
};
4091 
4092 } // namespace simd
4093 NLIB_NAMESPACE_END
4094 
4095 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Platform.h:2151
float x
The x-coordinate of the 3D vector.
Definition: SimdFloat.h:4048
SimdMatrix()
Instantiates the object with default parameters (default constructor).
Definition: SimdFloat.h:3941
The class with the collection of functions that handle 4x4 matrices.
Definition: SimdMatrix.h:15
Class representing the view frustum.
Definition: SimdGeometry.h:105
The class with the collection of functions that handle quaternions.
f128arg SimdVectorArg
f128arg is defined using typedef.
Definition: SimdFloat.h:3927
The type for two SIMD registers for 128-bit, single-precision, floating-point numbers.
Definition: SimdFloat.h:31
float x
The x-coordinate for the 4D vector.
Definition: SimdFloat.h:4054
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:50
float y
The y-coordinate of the 4D vector.
Definition: SimdFloat.h:4055
Implements the class and functions for SIMD computations on integers.
SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) noexcept
Sets up the matrix from the parameters.
Definition: SimdFloat.h:3942
The tag for representing the selection of a lane divided into 32-bit units with an empty structure...
Definition: SimdInt.h:50
f128arg SimdSphereArg
f128arg is defined using typedef.
Definition: SimdFloat.h:3933
Class for representing oriented bounding boxes (OBB). This class has data members to hold the center ...
Definition: SimdGeometry.h:80
static f128 ShiftRight(f128arg a, f128arg b) noexcept
Sets the elements of b in shifted order to the portion of a that becomes empty when a is shifted to t...
Definition: SimdFloat.h:306
The tag for representing a single-precision floating-point number with an empty structure.
Definition: SimdFloat.h:50
#define NLIB_ASSUME(cond)
Indicates that cond is true and provides tips for optimizing the compiler.
Definition: Platform.h:393
The class with the collection of functions that determine containment relations.
Definition: SimdGeometry.h:268
The class with the collection of static member functions that handle spheres in three-dimensional spa...
Definition: SimdGeometry.h:40
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constan...
Definition: SimdFloat.h:51
f128arg SimdQuaternionArg
f128arg is defined using typedef.
Definition: SimdFloat.h:3929
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:71
The class with the collection of functions that handle planes in three-dimensional space...
Definition: SimdGeometry.h:21
f128arg SimdPlaneArg
f128arg is defined using typedef.
Definition: SimdFloat.h:3931
The class with the collection of functions that perform calculations on three-dimensional vectors...
Definition: SimdVector3.h:13
#define NLIB_CEXPR
Defines constexpr if it is available for use. If not, holds an empty string.
static f128 RotateLeft(f128arg value) noexcept
Rotates four single-precision floating-point numbers to the left by the amount of N...
Definition: SimdFloat.h:293
The class with the collection of functions that perform square-of-distance calculations.
Definition: SimdGeometry.h:134
The type for reading and writing four-dimensional vectors in memory. Keeps float-type x...
Definition: SimdFloat.h:4053
const f128 f128arg
const f128 or const f128& is defined using typedef.
Definition: SimdFloat.h:61
The structure for keeping a 4x4 matrix.
Definition: SimdFloat.h:3938
float z
The z-coordinate of the 4D vector.
Definition: SimdFloat.h:4056
nlib_f128x2_t f128x2
nlib_f128x2_t is defined using typedef.
Definition: SimdFloat.h:56
f128 SimdSphere
f128 is defined using typedef. Used when handling spheres.
Definition: SimdFloat.h:3932
The class for single-precision floating-point SIMD computations using 128-bit registers (XMM0-XMM15 for...
Definition: SimdFloat.h:76
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
Definition: SimdInt.h:46
A file that contains the configuration information for each development environment.
The class with the collection of functions that perform calculations on four-dimensional vectors...
Definition: SimdVector4.h:11
The type for reading and writing three-dimensional vectors in memory. Keeps float-type x...
Definition: SimdFloat.h:4047
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:209
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
Definition: SimdInt.h:56
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
Definition: SimdFloat.h:4079
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Definition: SimdFloat.h:4064
The tag for representing an unsigned 32-bit integer with an empty structure.
Definition: SimdInt.h:36
static f128 RotateRight(f128arg value) noexcept
Rotates four single-precision floating-point numbers to the right by the amount of N...
Definition: SimdFloat.h:300
float y
The y-coordinate of the 3D vector.
Definition: SimdFloat.h:4049
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:59
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Definition: SimdFloat.h:54
float z
The z-coordinate of the 3D vector.
Definition: SimdFloat.h:4050
Class for representing axis-aligned bounding boxes (AABB). The class has data members to hold the min...
Definition: SimdGeometry.h:61
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:51
constexpr const each_int32_tag each_int32
The tag for representing a signed 32-bit integer with an each_int32_tag-type constant object...
Definition: SimdInt.h:42
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:117
float w
The w-coordinate of the 4D vector.
Definition: SimdFloat.h:4057
The class with the collection of functions that determine intersections.
Definition: SimdGeometry.h:178
f128 SimdQuaternion
f128 is defined using typedef. Used when handling quaternions.
Definition: SimdFloat.h:3928
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
Definition: SimdFloat.h:4087
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
Definition: SimdFloat.h:4071
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
Definition: SimdFloat.h:3930
__m128 nlib_f128_t
The type for a SIMD register for 128-bit, single-precision, floating-point numbers.
Definition: SimdFloat.h:30
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...
Definition: SimdFloat.h:3926