nlib
SimdFloat.h
[Details]
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
5 
6 #ifdef NN_PLATFORM_CTR
7 # ifndef __USE_C99_MATH
8 # define __USE_C99_MATH
9 # endif
10 #endif
11 #include <math.h>
12 #include <float.h>
13 
14 #include "nn/nlib/Config.h"
15 #include "nn/nlib/simd/SimdInt.h"
16 
// Fallback for toolchains whose <math.h> does not define INFINITY:
// 1e+300 * 1e+300 overflows to +inf at compile time.
#ifndef INFINITY
#define INFINITY ((float)(1e+300 * 1e+300))
#endif

// Fall back to the scalar implementation when no SIMD ISA is configured.
#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
// Scalar fallback: four lanes, viewable as floats or as raw uint32 bits.
typedef struct {
  union {
    float v[4];
    uint32_t u[4];
  } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_SSE41)
typedef __m128 nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_NEON)
typedef float32x4_t nlib_f128_t;
typedef float32x4x2_t nlib_f128x2_t;
#elif defined(CAFE)
// Cafe paired-singles: two f32x2 halves form one 4-lane vector.
typedef struct {
  union {
    f32x2 ps[2];
    float v[4];
    uint32_t u[4];
  } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#endif
49 
50 NLIB_NAMESPACE_BEGIN
51 namespace simd {
52 
// Tag type selecting the "broadcast one float to all lanes" overloads;
// pass each_float as the tag argument.
struct each_float_tag {};
56 
57 // __m128(SSE), float32x4_t(NEON)
58 typedef nlib_f128_t f128;
59 // float32x4x2_t(NEON)
61 
#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// Pass vectors by const reference where by-value register passing is unavailable.
typedef const f128& f128arg;
#else
typedef const f128 f128arg;
#endif

#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// For arguments beyond the register-passing limit (always by-ref on MSVC).
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif

#if !defined(_MSC_VER) || _MSC_VER < 1800
// __vectorcall exists only on MSVC 2013+; make it a no-op elsewhere.
#ifndef __vectorcall
#define __vectorcall
#endif
#endif
79 
81  public:
  // Broadcasts v to every lane.
  static f128 __vectorcall SetValue(float v, each_float_tag) NLIB_NOEXCEPT;
  // Broadcasts the raw bit pattern v to every lane (no int->float conversion).
  static f128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
  // r[i] = value[N] for all i.
  template <size_t N>
  static f128 __vectorcall SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall SetZero() NLIB_NOEXCEPT;
  // Unit basis vectors: 1.f in the named lane, 0.f elsewhere.
  static f128 __vectorcall Set1000() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0100() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0010() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0001() NLIB_NOEXCEPT;
  // r = value with lane N forced to 0.f.
  template <size_t N>
  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SetOne() NLIB_NOEXCEPT;
  static f128 __vectorcall SetNegativeOne() NLIB_NOEXCEPT;
  static f128 __vectorcall SetEpsilon() NLIB_NOEXCEPT;       // 1.0e-7f per lane
  static f128 __vectorcall SetInfinity() NLIB_NOEXCEPT;      // 0x7F800000 per lane
  static f128 __vectorcall SetNaN() NLIB_NOEXCEPT;           // 0x7FC00000 per lane
  static f128 __vectorcall SetSignMask() NLIB_NOEXCEPT;      // -0.f (0x80000000) per lane
100 
  // LoadAx: r[i] = p[i], where p is x-byte aligned.
  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
  // Integer-address overloads; forward to the pointer versions.
  static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;

  // StoreAx: p[i] = value[i], where p is x-byte aligned.
  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  // StoreLoAx: stores lanes 0 and 1 only.
  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  // StoreHiAx: stores lanes 2 and 3 only.
  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
134 
135  /*
136  static f128 __vectorcall ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT;
137  static f128 __vectorcall ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT;
138  static f128 __vectorcall ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT;
139  */
140 
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
  // Float <-> 32-bit integer vector conversions (unavailable in the scalar
  // and Cafe builds).
  static f128 __vectorcall ConvertFromI128(i128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertToI128Round(f128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT;

  // Bit-pattern reinterpretation; no value conversion.
  static f128 __vectorcall CastFromI128(i128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall CastToI128(f128 value) NLIB_NOEXCEPT;

  // Fixed-point <-> float conversions; N is presumably the number of
  // fractional bits — confirm against the implementation.
  template<int N>
  static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
  template<int N>
  static i128 __vectorcall ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT;
#endif
154 
  //
  // Arithmetic
  //
  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
  // Lane-select variant; presumably multiplies by lane N of b — confirm.
  template <size_t N>
  static f128 __vectorcall Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
  // Negates only the lanes whose template flag is true.
  template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
167  template <size_t N>
168  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
170  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
171  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
172  template <size_t N>
173  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
175  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Abs(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;

  //
  // Min/Max
  //

  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;

  //
  // Reciprocal/Sqrt ("Est" variants are faster, lower-precision estimates)
  //

  static f128 __vectorcall Recp(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Sqrt(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;

  //
  // Round/Truncate
  //

  static f128 __vectorcall Round(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Floor(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Ceil(f128arg value) NLIB_NOEXCEPT;

  //
  // Logical Operations (applied to the raw 128-bit pattern)
  //

  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Not(f128arg a) NLIB_NOEXCEPT;
  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
220 
  //
  // Comparison Operations (lanewise; results are all-ones/all-zeros masks
  // usable with Select/MoveMask/IsAllMask*)
  //

  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;

  // Comparisons against zero.
  static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;

  //
  // Trigonometric function
  //
  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Sin(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Cos(f128arg value) NLIB_NOEXCEPT;
  // Returns sin in val[0] and cos in val[1] — presumed from the name; confirm.
  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Tan(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SinH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CosH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall TanH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
  // and Est version needed?
260 
  //
  // Interpolation
  //

  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;

  //
  // Exp/Log
  //
  static f128 __vectorcall Exp2(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ExpE(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Log2(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall LogE(f128arg value) NLIB_NOEXCEPT;  // not implemented

  //
  // Misc
  //

  // NOTE(review): presumably packs one bit per lane into the low 4 bits,
  // like _mm_movemask_ps — confirm in the implementation.
  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
  // Presumably r[i] = mask[i] ? a[i] : b[i] (bitwise select) — confirm.
  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall IsNaN(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;

  //
  // Get/Set
  //

  template <size_t N>
  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;

  template <size_t N>
  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;

  //
  // Swizzle/Permute
  //

  // r[i] = value[Vi], Vi in 0-3.
  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
  // r[i] = Vi < 4 ? a[Vi] : b[Vi - 4], Vi in 0-7 (see SetZeroToLane/ShiftRight).
  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
  template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
317 
318  template <size_t N>
319  // left <- value[3], value[2], value[1], value[0] -> right
320  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
321  NLIB_STATIC_ASSERT(N < 4);
322  const size_t NN = 4 - N;
323  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
324  }
325  template <size_t N>
326  // left <- value[3], value[2], value[1], value[0] -> right
327  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
328  NLIB_STATIC_ASSERT(N < 4);
329  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
330  }
331  template <size_t N>
332  // left <- b[3], ..., b[0], a[3], ..., a[0] -> right
333  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
334  NLIB_STATIC_ASSERT(N < 4);
335  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
336  }
337 
 private:
  // Constant tables loaded by the Set1000..Set0001 helpers.
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v1000_[4];  // 1.f, 0.f, 0.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0100_[4];  // 0.f, 1.f, 0.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0010_[4];  // 0.f, 0.f, 1.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0001_[4];  // 0.f, 0.f, 0.f, 1.f

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float pi_values_[4];  // pi, -pi, 2pi, pi/2
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const uint32_t nan_inf_[4];  // nan, inf, -nan, -inf

  // Spline coefficient tables for Hermite() and CatmullRom().
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R0_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R1_[4];

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R0_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R1_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R2_[4];

  // Those coefficients are from Cephes Math Library
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_coeff_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_coeff_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan_coeff_[8];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan2_cvalue_[4];

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_P_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_Q_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tanh_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_p_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_q_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_c_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float log2_PQ_[12];

  F128();  // forbidden: static-only utility class, never instantiated
  friend class Vector3;
  friend class Vector4;
  friend class Matrix;
  friend class Plane;
  friend class Quaternion;
  friend class Sphere;
  friend class AxisAlignedBox;
  friend class OrientedBox;
  friend class Frustum;

  friend class DistanceSq;
  friend class Intersection;
  friend class Containment;
};
385 
#ifndef NLIB_DOXYGEN

// NLIB_M: always-inlined member definition helper; NLIB_M2: plain inline.
#undef NLIB_M
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall
391 
// r[i] = v for every lane.
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Scalar fallback: write each lane by hand.
  f128 ret;
  ret.vec.v[0] = v;
  ret.vec.v[1] = v;
  ret.vec.v[2] = v;
  ret.vec.v[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#elif defined(CAFE)
  // Duplicate into both paired-single halves.
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
  return ret;
#endif
}
411 
// r[i] = the float whose bit pattern is v, for every lane.
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.u[0] = v;
  ret.vec.u[1] = v;
  ret.vec.u[2] = v;
  ret.vec.u[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  // Type-pun through a union so no int->float value conversion occurs.
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  // Broadcast as u32 lanes, then reinterpret the bits as floats.
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#elif defined(CAFE)
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
  return ret;
#endif
}
442 
// r[0] = a, r[1] = b, r[2] = c, r[3] = d
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a;
  ret.vec.v[1] = b;
  ret.vec.v[2] = c;
  ret.vec.v[3] = d;
  return ret;
#elif defined(NLIB_SSE41)
  // _mm_set_ps takes its arguments from the highest lane down.
  return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
  // Pack each float pair into a 64-bit image, then combine the two halves.
  union {
    float f32[2];
    uint64_t u64;
  } tmp1, tmp2;
  tmp1.f32[0] = a;
  tmp1.f32[1] = b;
  tmp2.f32[0] = c;
  tmp2.f32[1] = d;
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
  return ret;
#endif
}
473 
template <size_t N>
// r[i] = value[N] for every lane.
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  // Valid for N = 0, 1; N = 2, 3 use the vget_high specializations below.
  float32x2_t tmp = vget_low_f32(value);
  return vdupq_lane_f32(tmp, N);
#elif defined(CAFE)
  // Generic form; the explicit __PS_MERGE specializations below override this.
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
  return ret;
#endif
}
496 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Lanes 2 and 3 live in the high half on NEON; specialize to read it directly.
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// Cafe: broadcast one element by merging a paired-single with itself.
template <>
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
#endif
534 
// r[i] = 0.f for every lane.
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
  return ret;
#endif
}
554 
// r = { 1.f, 0.f, 0.f, 0.f }
NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 1.f;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  // 0x3F800000 (1.0f) in the low 32 bits: the pair is { 1.f, 0.f }.
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x10, x00);
#else
  // SSE/Cafe: load the prebuilt constant table.
  return F128::LoadA16(F128::v1000_);
#endif
}
571 
// r = { 0.f, 1.f, 0.f, 0.f }
NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 1.f;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  // 0x3F800000 (1.0f) in the upper 32 bits: the pair is { 0.f, 1.f }.
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x01, x00);
#else
  return F128::LoadA16(F128::v0100_);
#endif
}
588 
// r = { 0.f, 0.f, 1.f, 0.f }
NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 1.f;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  // { 1.f, 0.f } pair placed in the high half: lanes become 0, 0, 1, 0.
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x10);
#else
  return F128::LoadA16(F128::v0010_);
#endif
}
605 
// r = { 0.f, 0.f, 0.f, 1.f }
NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 1.f;
  return ret;
#elif defined(NLIB_NEON)
  // { 0.f, 1.f } pair placed in the high half: lanes become 0, 0, 0, 1.
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x01);
#else
  return F128::LoadA16(F128::v0001_);
#endif
}
622 
template <size_t N>
// r = value with lane N replaced by 0.f.
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  // Low 4 bits of the immediate are the zero mask: 1 << N zeroes lane N.
  return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
  // Pick lane N from a zero vector (index >= 4 selects the second operand).
  return F128::Permute<N == 0 ? 4 : 0,
                       N == 1 ? 5 : 1,
                       N == 2 ? 6 : 2,
                       N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
  // return vsetq_lane_f32(0.f, value, N);
#elif defined(CAFE)
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = 0.f;
  return ret;
#endif
}
645 
646 // r[i] = 1.f
647 NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
648  return F128::SetValue(1.f, each_float);
649 }
650 
651 // r[i] = -1.f
652 NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
653  return F128::SetValue(-1.f, each_float);
654 }
655 
656 // r[i] = 1.0e-7f
657 NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
658  return F128::SetValue(1.0e-7f, each_float);
659 }
660 
661 // r[i] = 0x7F800000U
662 NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
663  return F128::SetValue(0x7F800000U, each_uint32);
664 }
665 
666 // r[i] = 0x7FC00000U
667 NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
668  return F128::SetValue(0x7FC00000U, each_uint32);
669 }
670 
671 // r[i] = -0.f(0x80000000U)
672 NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
673  return F128::SetValue(-0.f, each_float);
674 }
675 
// r[i] = p[i]; p must be 16-byte aligned.
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = p[0];
  ret.vec.v[1] = p[1];
  ret.vec.v[2] = p[2];
  ret.vec.v[3] = p[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  // Loads as u64 lanes, then reinterprets as floats — NOTE(review):
  // presumably chosen to convey the wider alignment guarantee; confirm.
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
700 
// r[i] = p[i]; p only needs 4-byte alignment (uses unaligned-capable loads).
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return LoadA16(p);
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
  return vld1q_f32(p);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
718 
// r[i] = p[i]; p must be 8-byte aligned.
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // Load as u64 lanes; assumes p is at least 8-byte aligned.
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  // Other targets have no distinct 8-byte-aligned form; defer to LoadA4.
  return LoadA4(p);
#endif
}
729 
730 // r[i] = p[i], p is 16 bytes aligned
731 NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
732  return LoadA16(reinterpret_cast<const float*>(p));
733 }
734 
735 // r[i] = p[i], p is 8 bytes aligned
736 NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
737  return LoadA8(reinterpret_cast<const float*>(p));
738 }
739 
740 // r[i] = p[i], p is 4 bytes aligned
741 NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
742  return LoadA4(reinterpret_cast<const float*>(p));
743 }
744 
745 // r[i] = p[i], p is 16 bytes aligned
746 NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
747  return LoadA16(reinterpret_cast<const float*>(p));
748 }
749 
750 // r[i] = p[i], p is 8 bytes aligned
751 NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
752  return LoadA8(reinterpret_cast<const float*>(p));
753 }
754 
755 // r[i] = p[i], p is 4 bytes aligned
756 NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
757  return LoadA4(reinterpret_cast<const float*>(p));
758 }
759 
// p[i] = value[i]; p must be 16-byte aligned.
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  // Store through u64 lanes (mirrors the LoadA16 alignment trick).
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
779 
// p[i] = value[i]; p only needs 4-byte alignment (unaligned-capable stores).
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  StoreA16(p, value);
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
  vst1q_f32(p, value);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
795 
// p[i] = value[i]; p must be 8-byte aligned.
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // Store as u64 lanes; assumes p is at least 8-byte aligned.
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  // Other targets have no distinct 8-byte-aligned form; defer to StoreA4.
  StoreA4(p, value);
#endif
}
805 
806 // p[i] = value[i], p is 16 bytes aligned
807 NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
808  StoreA16(reinterpret_cast<float*>(p), value);
809 }
810 
811 // p[i] = value[i], p is 8 bytes aligned
812 NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
813  StoreA8(reinterpret_cast<float*>(p), value);
814 }
815 
816 // p[i] = value[i], p is 4 bytes aligned
817 NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
818  StoreA4(reinterpret_cast<float*>(p), value);
819 }
820 
821 // p[i] = value[i], p is 16 bytes aligned
822 NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
823  StoreA16(reinterpret_cast<float*>(p), value);
824 }
825 
826 // p[i] = value[i], p is 8 bytes aligned
827 NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
828  StoreA8(reinterpret_cast<float*>(p), value);
829 }
830 
831 // p[i] = value[i], p is 4 bytes aligned
832 NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
833  StoreA4(reinterpret_cast<float*>(p), value);
834 }
835 
// p[0] = value[0], p[1] = value[1]; p must be 8-byte aligned.
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  // Store the low half as a single u64 lane (8-byte aligned form).
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
851 
// p[0] = value[0], p[1] = value[1]; p only needs 4-byte alignment.
NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  // Float-lane store carries no extra alignment requirement.
  float32x2_t tmp = vget_low_f32(value);
  vst1_f32(p, tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
867 
868 // p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
869 NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
870  StoreLoA8(reinterpret_cast<float*>(p), value);
871 }
872 
873 // p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
874 NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
875  StoreLoA4(reinterpret_cast<float*>(p), value);
876 }
877 
878 // p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
879 NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
880  StoreLoA8(reinterpret_cast<float*>(p), value);
881 }
882 
883 // p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
884 NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
885  StoreLoA4(reinterpret_cast<float*>(p), value);
886 }
887 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: copy the two high lanes.
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    // Store the high 64 bits of the register.
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    vst1_f32(p, vget_high_f32(value));
#elif defined(CAFE)
    // Paired-single layout: lanes 2..3 live in ps[1].
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}
902 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: copy the two high lanes.
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    // Two-float store, so only 4-byte alignment is required.
    float32x2_t tmp = vget_high_f32(value);
    vst1_f32(p, tmp);
#elif defined(CAFE)
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}
918 
919 // p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
920 NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
921  StoreHiA8(reinterpret_cast<float*>(p), value);
922 }
923 
924 // p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
925 NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
926  StoreHiA4(reinterpret_cast<float*>(p), value);
927 }
928 
929 // p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
930 NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
931  StoreHiA8(reinterpret_cast<float*>(p), value);
932 }
933 
934 // p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
935 NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
936  StoreHiA4(reinterpret_cast<float*>(p), value);
937 }
938 
// r[i] = fabs(value[i])
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback. NOTE(review): for NaN lanes `v > 0` is false, so the
    // sign bit may be flipped, unlike the SSE sign-mask version below —
    // confirm no caller depends on bit-exact NaN results.
    f128 ret;
    ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
    ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
    ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
    ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vabsq_f32(value);
#elif defined(NLIB_SSE41)
    // Clear the sign bit of every lane.
    const __m128 signmask = _mm_set1_ps(-0.0f);  // 0x80000000
    return _mm_andnot_ps(signmask, value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
    return ret;
#endif
}
960 
// r[i] = mask[i] ? a[i] : b[i]
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Bitwise blend: each result bit comes from a where the mask bit is 1,
    // from b where it is 0.
    f128 result;
    result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
    result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
    result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
    result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
    return result;
#elif defined(NLIB_SSE41)
    // blendv selects by the mask's per-lane sign bit; note argument order (b, a).
    return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
    return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(CAFE)
    // avoid NaN
    // __PS_SEL selects by the sign of the float mask. An all-ones mask
    // (0xFFFFFFFF) would be a NaN, so clear the exponent LSB (bit 23): the
    // pattern becomes 0xFF7FFFFF, a finite negative value, and the
    // sign-based selection still picks `a`.
    f128 mask_ = mask;
    mask_.vec.u[0] &= 0xFF7FFFFFUL;
    mask_.vec.u[1] &= 0xFF7FFFFFUL;
    mask_.vec.u[2] &= 0xFF7FFFFFUL;
    mask_.vec.u[3] &= 0xFF7FFFFFUL;
    // mask_ < 0 ? a : b
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
988 
989 /*
990 NLIB_M(f128) F128::ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT {
991 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
992  float16x4_t lo = vget_low_f16(vreinterpretq_f16_f32(value));
993  return vcvt_f32_f16(lo);
994 #else
995  // _mm_cvtph_ps
996  (void)value;
997  return F128::SetZero();
998 #endif
999 }
1000 
1001 NLIB_M(f128) F128::ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT {
1002 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1003 #ifdef __aarch64__
1004  return vcvt_high_f32_f16(value);
1005 #else
1006  float16x4_t hi = vget_high_f16(vreinterpretq_f16_f32(value));
1007  return vcvt_f32_f16(hi);
1008 #endif
1009 #else
1010  // _mm_cvtph_ps
1011  (void)value;
1012  return F128::SetZero();
1013 #endif
1014 }
1015 
1016 NLIB_M(f128) F128::ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT {
1017 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1018 #ifdef __aarch64__
1019  float16x4_t lo = vcvt_f16_f32(a);
1020  float16x8_t x = vcvt_high_f16_f32(lo, b);
1021  return vreinterpretq_f32_f16(x);
1022 #else
1023  float16x4_t lo = vcvt_f16_f32(a);
1024  float16x4_t hi = vcvt_f16_f32(b);
1025  return vreinterpretq_f32_f16(vcombine_f16(lo, hi));
1026 #endif
1027 #else
1028  // _mm_cvtps_ph
1029  (void)a;
1030  (void)b;
1031  return F128::SetZero();
1032 #endif
1033 }
1034 */
1035 
1036 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// r[i] = static_cast<float>(value[i])
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
    // i128 is a byte vector here; view it as 4 x int32 before converting.
    return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}
1045 
// r[i] = *reinterpret_cast<float*>(&value[i])
// Pure bit reinterpretation; no numeric conversion takes place.
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_f32_s8(value);
#endif
}
1054 
// r[i] = static_cast<int>(roundf(value[i]))
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    // Uses the current MXCSR rounding mode (round-to-nearest-even by default).
    return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
    // Copy the sign of each lane onto 0.5, add, then truncate: rounds half
    // away from zero. NOTE(review): halfway cases can therefore differ from
    // the SSE path (nearest-even) — confirm callers tolerate this.
    uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
    uint32x4_t sgn = vdupq_n_u32(0x80000000U);
    uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
    w = vorrq_u32(w, half);
    return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}
1067 
// r[i] = static_cast<int>(value[i]) -- truncation toward zero.
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
    // vcvtq_s32_f32 truncates toward zero.
    return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}
1075 
// r[i] = *reinterpret_cast<int*>(&value[i])
// Pure bit reinterpretation; no numeric conversion takes place.
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_f32(value);
#endif
}
1084 
template<int N>
// r[i] = value[i] * 2^-N : convert Q(32-N).N fixed point to float.
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
    return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
    // Build 2^-N directly as an IEEE-754 bit pattern: biased exponent
    // (127 - N) with a zero mantissa.
    f128 f = F128::ConvertFromI128(value);
    f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32);  // 0.5f^N
    return F128::Mult(f, m);
#endif
}
1096 
template<int N>
// r[i] = truncate(value[i] * 2^N) : convert float to Q(32-N).N fixed point.
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
    // Build 2^N as an IEEE-754 bit pattern: biased exponent (127 + N).
    f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32);  // 2.f^N
    f128 f = F128::Mult(value, m);
    return F128::ConvertToI128Truncate(f);
#endif
}
1108 
1109 #endif
1110 
// r[i] = (a[i] < b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    // Scalar fallback: write all-ones/all-zeros lane masks through the
    // integer view of the union.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcltq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1127 
// r[i] = (a[i] <= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    // Scalar fallback producing all-ones/all-zeros lane masks.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcleq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1144 
// r[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    // Scalar fallback producing all-ones/all-zeros lane masks.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgtq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1161 
// r[i] = (a[i] >= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    // Scalar fallback producing all-ones/all-zeros lane masks.
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgeq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1178 
// r[i] = (a[i] != b[i]) ? 0xFFFFFFFF : 0
// Note: like the intrinsics, NaN lanes compare as "not equal" (mask set).
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
    // NEON has no "not equal": invert the equality mask.
    uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1195 
// r[i] = a[i] + b[i]
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise addition.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
    return vaddq_f32(a, b);
#elif defined(CAFE)
    // Two paired-single adds cover all four lanes.
    f128 ret;
    ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1216 
// r[i] = a[i] - b[i]
NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise subtraction.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
    return vsubq_f32(a, b);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1237 
// r[i] = -value[i]
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise negation.
    f128 ret;
    ret.vec.v[0] = -value.vec.v[0];
    ret.vec.v[1] = -value.vec.v[1];
    ret.vec.v[2] = -value.vec.v[2];
    ret.vec.v[3] = -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vnegq_f32(value);
#elif defined(NLIB_SSE41)
    // Flip the sign bit of every lane.
    const __m128 signmask = _mm_set1_ps(-0.0f);  // 0x80000000
    return _mm_xor_ps(signmask, value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
    return ret;
#endif
}
1259 
// r[i] = a[i] * b[i]
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise multiplication.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
    return vmulq_f32(a, b);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1280 
// r[i] = a * b[i]
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON multiplies by a scalar without an explicit broadcast.
    return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
    ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
    return ret;
#else
    // Broadcast the scalar into all lanes, then multiply lane-wise.
    return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}
1294 
1295 template <size_t N>
1296 // r[i] = a[N] * b[i]
1297 NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
1298 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1299 # if __aarch64__
1300  return vmulq_laneq_f32(b, a, N);
1301 # else
1302  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
1303  return vmulq_n_f32(b, tmp);
1304 # endif
1305 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
1306  float t = a.vec.ps[N / 2][N % 2];
1307  f128 ret;
1308  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
1309  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
1310  return ret;
1311 #else
1312  return F128::Mult(F128::SetValue<N>(a, each_select32), b);
1313 #endif
1314 }
1315 
// r[i] = a[i] / b[i]
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise division.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vdivq_f32(a, b);
# else
    // 32-bit NEON has no divide: refine the reciprocal estimate with two
    // Newton-Raphson steps (vrecpsq computes 2 - inv*b), then multiply.
    float32x4_t inv0 = vrecpeq_f32(b);
    float32x4_t step0 = vrecpsq_f32(inv0, b);
    float32x4_t inv1 = vmulq_f32(step0, inv0);
    float32x4_t step1 = vrecpsq_f32(inv1, b);
    float32x4_t inv2 = vmulq_f32(step1, inv1);
    // Force 1/0 lanes to infinity instead of the estimate's garbage.
    // NOTE(review): this loses the divisor's sign (-0 also maps to
    // SetInfinity()) — confirm callers don't depend on signed infinities.
    uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
    inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
    return vmulq_f32(a, inv2);
# endif
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1347 
// r[i] = max(a[i], b[i])
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise maximum.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
    return vmaxq_f32(a, b);
#elif defined(CAFE)
    // Select by the sign of (a - b): non-negative picks a.
    // NOTE(review): NaN lanes behave differently from _mm_max_ps/vmaxq_f32
    // here — confirm inputs are NaN-free on this platform.
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1370 
// r[i] = min(a[i], b[i])
NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: lane-wise minimum.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
    return vminq_f32(a, b);
#elif defined(CAFE)
    // Select by the sign of (a - b): non-negative picks b (operands swapped
    // relative to Max).
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
1393 
// r[0] = max(a[0], a[1]), r[1] = max(a[2], a[3]), r[2] = max(b[0], b[1]), r[3] = max(b[2], b[3])
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: reduce adjacent pairs.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Max each vector against its lane-swapped self, then pack the even
    // lanes of both results.
    f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpmaxq_f32(a, b);
# else
    float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
    // Gather even lanes (v02) and odd lanes (v13), then select by the sign
    // of their difference, as in Max().
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
    return ret;
#endif
}
1429 
// r[0] = min(a[0], a[1]), r[1] = min(a[2], a[3]), r[2] = min(b[0], b[1]), r[3] = min(b[2], b[3])
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: reduce adjacent pairs.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Min each vector against its lane-swapped self, then pack even lanes.
    f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpminq_f32(a, b);
# else
    float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
    // As PairwiseMax, but the select picks the smaller element (v13/v02
    // swapped).
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
    return ret;
#endif
}
1465 
// r[0] = a[0] + a[1], r[1] = a[2] + a[3], ...
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: horizontal add of adjacent pairs.
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpaddq_f32(a, b);
# else
    // 32-bit NEON: pairwise-add each half, then recombine.
    float32x2_t al = vget_low_f32(a);
    float32x2_t ah = vget_high_f32(a);
    float32x2_t l = vpadd_f32(al, ah);

    float32x2_t bl = vget_low_f32(b);
    float32x2_t bh = vget_high_f32(b);
    float32x2_t h = vpadd_f32(bl, bh);
    return vcombine_f32(l, h);
# endif
#elif defined(CAFE)
    // Gather even lanes (v02) and odd lanes (v13), then add them.
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    ret.vec.ps[0] = __PS_ADD(v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    ret.vec.ps[1] = __PS_ADD(v02, v13);
    return ret;
#endif
}
1502 
1503 // r[i] = fabs(a[i] - b[i])
1504 NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
1505 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1506  return vabdq_f32(a, b);
1507 #else
1508  return F128::Abs(F128::Sub(a, b));
1509 #endif
1510 }
1511 
// r[i] = c[i] + a[i] * b[i]
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    // Fused multiply-add (single rounding).
    return vfmaq_f32(c, a, b);
# else
    // Multiply-accumulate; fusion is not guaranteed on 32-bit NEON.
    return vmlaq_f32(c, a, b);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    // Generic path: separate multiply and add (two roundings).
    return F128::Add(c, F128::Mult(a, b));
#endif
}
1529 
// r[i] = c[i] + a * b[i]
NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    // Fused multiply-add with a scalar multiplier.
    return vfmaq_n_f32(c, b, a);
# else
    return vmlaq_n_f32(c, b, a);
# endif
#else
    // Broadcast the scalar, then use the vector MultAdd.
    return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}
1542 
1543 template <size_t N>
1544 // r[i] = c[i] + a[N] * b[i]
1545 NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
1547  NLIB_STATIC_ASSERT(N < 4);
1548 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1549 # if __aarch64__
1550  return vfmaq_laneq_f32(c, b, a, N);
1551 # else
1552  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1553 # endif
1554 #else
1555  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
1556 #endif
1557 }
1558 
// r[i] = c[i] - a[i] * b[i]
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    // Fused multiply-subtract: c - a*b with a single rounding.
    return vfmsq_f32(c, a, b);
# else
    return vmlsq_f32(c, a, b);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    // __PS_NMSUB computes -(a*b - c) == c - a*b.
    f128 ret;
    ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    // Generic path: separate multiply and subtract (two roundings).
    return F128::Sub(c, F128::Mult(a, b));
#endif
}
1576 
// r[i] = c[i] - a * b[i]
NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    // Fused multiply-subtract with a scalar multiplier.
    return vfmsq_n_f32(c, b, a);
# else
    return vmlsq_n_f32(c, b, a);
# endif
#else
    // Broadcast the scalar, then use the vector MultSub.
    return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}
1589 
1590 template <size_t N>
1591 // r[i] = c[i] - a[N] * b[i]
1592 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
1594  NLIB_STATIC_ASSERT(N < 4);
1595 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1596 # ifdef __arch64__
1597  return vfmsq_laneq_f32(c, b, a, N);
1598 # else
1599  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1600 # endif
1601 #else
1602  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
1603 #endif
1604 }
1605 
1606 // r[i] = a[i] + t[i] * (b[i] - a[i])
1607 NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
1608  // a + t * (b - a)
1609  return F128::MultAdd(t, F128::Sub(b, a), a);
1610 }
1611 
// r[i] = a[i] & b[i]
// Bitwise AND of the raw lane bits; typically used on comparison masks.
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1628 
// -pi <= angle1, angle2, r < pi
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    // Wraps angle1 + angle2 back into [-pi, pi) by conditionally adding or
    // subtracting 2*pi. Assumes pi_values_ lanes are {pi, -pi, 2*pi, ...} —
    // TODO(review): confirm against the definition of pi_values_.
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    f128 sum = F128::Add(angle1, angle2);
    // If sum < -pi, add 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    // If sum >= pi, subtract 2*pi (the two conditions are exclusive).
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}
1644 
// -pi <= angle1, angle2, r < pi
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    // Same wrapping scheme as AddAngle, applied to the difference.
    // Assumes pi_values_ lanes are {pi, -pi, 2*pi, ...} — TODO(review): confirm.
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    f128 sum = F128::Sub(angle1, angle2);
    // If the difference is below -pi, add 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    // If the difference is at or above pi, subtract 2*pi.
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}
1660 
//                 ( 2  1 -2  1) (p0)
// (t^3 t^2 t 1)   (-3 -2  3 -1) (v0)
//                 ( 0  1  0  0) (p1)
//                 ( 1  0  0  0) (v1)
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
    // (2 * p0 + v0 - 2 * p1 + v1) * t^3 + (-3 * p0 - 2 * v0 + 3 * p1 - v1) * t^2
    // + v0 * t + p0
    // ==
    // (2 * t^3 - 3 * t^2 + 1) * p0 + (t^3 - 2 * t^2 + t) * v0
    // + (-2 * t^3 + 3 * t^2) * p1 + (t^3 - t^2) * v1
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // Basis matrix rows as immediates (vcreate packs lane0 in the low 32
    // bits): R0 = {2, 1, -2, 1}, R1 = {-3, -2, 3, -1}.
    f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),
                                   vcreate_f32(0x3F800000C0000000ULL));
    f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),
                                   vcreate_f32(0xBF80000040400000ULL));
#else
    f128 hermite_R0 = F128::LoadA16(hermite_R0_);
    f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif

    // ttt becomes the per-control-point weight vector:
    // t^3*R0 + t^2*R1 + t*{0,1,0,0} + {1,0,0,0}.
    ttt = F128::Mult(ttt, hermite_R0);
    ttt = F128::MultAdd(tt, hermite_R1, ttt);
    ttt = F128::MultAdd(t, F128::Set0100(), ttt);
    ttt = F128::Add(ttt, F128::Set1000());

    // vec(ttt) * mtx(p0, v0, p1, v1)
    f128 result = F128::Mult<0>(ttt, p0, each_select32);
    result = F128::MultAdd<1>(ttt, v0, result, each_select32);
    result = F128::MultAdd<2>(ttt, p1, result, each_select32);
    result = F128::MultAdd<3>(ttt, v1, result, each_select32);
    return result;
}
1696 
1697 // (-1 3 -3 1) (p0)
1698 // 0.5 * (t^3 t^2 t 1) ( 2 -5 4 -1) (p1)
1699 // (-1 0 1 0) (p2)
1700 // ( 0 2 0 0) (p3)
1701 NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
1702  f128arg_ex t) NLIB_NOEXCEPT {
1703  f128 tt = F128::Mult(t, t);
1704  f128 ttt = F128::Mult(tt, t);
1705 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1706  f128 catmull_R0 = vcombine_f32(vcreate_f32(0x40400000BF800000ULL),
1707  vcreate_f32(0x3F800000C0400000ULL));
1708  f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC0A0000040000000ULL),
1709  vcreate_f32(0xBF80000040800000ULL));
1710  f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF800000ULL),
1711  vcreate_f32(0x000000003F800000ULL));
1712 #else
1713  f128 catmull_R0 = F128::LoadA16(catmull_R0_);
1714  f128 catmull_R1 = F128::LoadA16(catmull_R1_);
1715  f128 catmull_R2 = F128::LoadA16(catmull_R2_);
1716 #endif
1717  ttt = F128::Mult(ttt, catmull_R0);
1718  ttt = F128::MultAdd(tt, catmull_R1, ttt);
1719  ttt = F128::MultAdd(t, catmull_R2, ttt);
1720  ttt = F128::Add(ttt, F128::Set0100());
1721 
1722  // vec(ttt) * mtx(p0, p1, p2, p3)
1723  f128 result = F128::Mult<0>(ttt, p0, each_select32);
1724  result = F128::MultAdd<1>(ttt, p1, result, each_select32);
1725  result = F128::MultAdd<2>(ttt, p2, result, each_select32);
1726  result = F128::MultAdd<3>(ttt, p3, result, each_select32);
1727 
1728  return result;
1729 }
1730 
1731 // p0 + f * (p1 - p0) + g * (p2 - p0)
1732 NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
1733  f128arg_ex g) NLIB_NOEXCEPT {
1734  f128 p1p0 = F128::Sub(p1, p0);
1735  f128 p2p0 = F128::Sub(p2, p0);
1736  f128 tmp = F128::MultAdd(f, p1p0, p0);
1737  return F128::MultAdd(g, p2p0, tmp);
1738 }
1739 
// r[i] = a[i] | b[i]
// Bitwise OR of the raw lane bits; typically used on comparison masks.
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1756 
// r[i] = a[i] ^ b[i]
// Bitwise XOR of the raw lane bits.
NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1773 
1774 // r[i] = ~a[i]
1775 NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
1776 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1777  f128 ret;
1778  ret.vec.u[0] = ~a.vec.u[0];
1779  ret.vec.u[1] = ~a.vec.u[1];
1780  ret.vec.u[2] = ~a.vec.u[2];
1781  ret.vec.u[3] = ~a.vec.u[3];
1782  return ret;
1783 #elif defined(NLIB_SSE41)
1784  return _mm_andnot_ps(a, F128::CmpEq(a, a));
1785 #elif defined(NLIB_NEON)
1786  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
1787  return vreinterpretq_f32_u32(tmp);
1788 #endif
1789 }
1790 
1791 // r[i] = ~a[i] & b[i]
1792 NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1793 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1794  f128 ret;
1795  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
1796  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
1797  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
1798  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
1799  return ret;
1800 #elif defined(NLIB_SSE41)
1801  return _mm_andnot_ps(a, b);
1802 #elif defined(NLIB_NEON)
1803  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1804  return vreinterpretq_f32_u32(tmp);
1805 #endif
1806 }
1807 
1808 // r[i] = ~a[i] | b[i]
1809 NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1810 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1811  f128 ret;
1812  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
1813  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
1814  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
1815  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
1816  return ret;
1817 #elif defined(NLIB_SSE41)
1818  return _mm_or_ps(F128::Not(a), b);
1819 #elif defined(NLIB_NEON)
1820  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1821  return vreinterpretq_f32_u32(tmp);
1822 #endif
1823 }
1824 
1825 // r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
1826 NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
1827 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1828  f128 ret;
1829  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
1830  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
1831  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
1832  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
1833  return ret;
1834 #elif defined(NLIB_SSE41)
1835  return _mm_cmpeq_ps(a, b);
1836 #elif defined(NLIB_NEON)
1837  uint32x4_t tmp = vceqq_f32(a, b);
1838  return vreinterpretq_f32_u32(tmp);
1839 #endif
1840 }
1841 
1842 // r[i] = (absf(a[i] - b[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1843 NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
1844  f128 tmp = F128::AbsDiff(a, b);
1845  return F128::CmpLe(tmp, eps);
1846 }
1847 
1848 // r[i] = clamp(value[i], min[i], max[i])
1849 NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
1850  return F128::Min(max, F128::Max(min, value));
1851 }
1852 
// r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // vcaleq_f32 compares absolute values: |value| <= |bounds|.
    // NOTE(review): this matches the generic path only when bounds[i] >= 0
    // (so |bounds| == bounds) — confirm callers always pass non-negative bounds.
    uint32x4_t tmp = vcaleq_f32(value, bounds);
    return vreinterpretq_f32_u32(tmp);
#else
    return F128::CmpLe(F128::Abs(value), bounds);
#endif
}
1862 
// r[i] = (value[i] == 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // AArch64 has a dedicated compare-with-zero instruction.
    return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
    return F128::CmpEq(value, F128::SetZero());
#endif
}
1870 
// r[i] = (value[i] < 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // AArch64 has a dedicated compare-with-zero instruction.
    return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
    return F128::CmpLt(value, F128::SetZero());
#endif
}
1878 
// r[i] = (value[i] <= 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // AArch64 has a dedicated compare-with-zero instruction.
    return vreinterpretq_f32_u32(vclezq_f32(value));
#else
    return F128::CmpLe(value, F128::SetZero());
#endif
}
1886 
// r[i] = (value[i] > 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // AArch64 has a dedicated compare-with-zero instruction.
    return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
    return F128::CmpGt(value, F128::SetZero());
#endif
}
1894 
// r[i] = (value[i] >= 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // AArch64 has a dedicated compare-with-zero instruction.
    return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
    return F128::CmpGe(value, F128::SetZero());
#endif
}
1902 
// r[i] = (value[i] != 0.f) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // Compare equal to zero, then invert the mask bitwise.
    return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
    return F128::CmpNe(value, F128::SetZero());
#endif
}
1910 
1911 // r[i] = (absf(value[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1912 NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
1913  f128 tmp = F128::Abs(value);
1914  return F128::CmpLe(tmp, eps);
1915 }
1916 
// r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: exact division, explicit +INFINITY for zero lanes.
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // Hardware division already yields +/-inf for zero divisors.
    return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vdivq_f32(vdupq_n_f32(1.f), value);
#else
    // ARMv7 NEON has no divide: reciprocal estimate refined by two
    // Newton-Raphson steps, then zero lanes are patched to infinity.
    float32x4_t x;
    x = vrecpeq_f32(value);
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x1 = x0 * (2 - x0 * value)
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x2 = x1 * (2 - x1 * value)
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#endif
#elif defined(CAFE)
    return F128::Div(F128::SetOne(), value);
#endif
}
1944 
// r[i] = 1.f / value[i] with lower precision
NLIB_M2(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback computes the exact value (no cheap estimate available).
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // ~12-bit reciprocal estimate.
    return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
    // ~8-bit reciprocal estimate.
    return vrecpeq_f32(value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
    return ret;
#endif
}
1965 
// r[i] = sqrtf(value[i]) with higher precision
NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) = v * rsqrt(v).  For v == 0, rsqrt is inf and 0 * inf = NaN,
    // so zero lanes are masked back to 0 with AndNot.
    f128 iszero = F128::CmpEqZero(value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::AndNot(iszero, result);
#elif defined(CAFE)
    // Same v * rsqrt(v) trick; Select writes 0 into zero lanes.
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#endif
}
1988 
// r[i] = sqrtf(value[i]) with lower precision
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback computes the exact value.
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) estimated as 1 / rsqrt(v), both estimate instructions.
    return vrecpeq_f32(vrsqrteq_f32(value));
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
    return ret;
#endif
}
2009 
// r[i] = sqrtf(1.f / value[i]) with higher precision
NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
    // rsqrt estimate refined with two Newton-Raphson steps
    // (vrsqrtsq computes (3 - v * x^2) / 2); zero lanes become infinity.
    float32x4_t x;
    x = vrsqrteq_f32(value);
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    f128 zeromask = F128::CmpEqZero(value);
    return F128::Select(zeromask, F128::SetInfinity(), x);
#elif defined(CAFE)
    // Paired-single rsqrt estimate with two Newton-Raphson refinements
    // per pair: x' = x * (3 - v * x^2) / 2, unrolled for ps[0] and ps[1].
    f32x2 three = __PS_FDUP(3.f);
    f32x2 half = __PS_FDUP(0.5f);
    f32x2 x;
    f32x2 xx;
    f32x2 v;
    f128 ret;

    v = value.vec.ps[0];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[0] = __PS_MUL(half, xx);

    v = value.vec.ps[1];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[1] = __PS_MUL(half, xx);

    // Patch zero lanes to +infinity, matching the other platforms.
    f128 iszero = F128::CmpEq(F128::SetZero(), value);
    f128 inf = F128::SetInfinity();
    return F128::Select(iszero, inf, ret);
#endif
}
2067 
// r[i] = sqrtf(1.f / value[i]) with lower precision
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback computes the exact value.
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
    return vrsqrteq_f32(value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
    return ret;
#endif
}
2088 
// Negate only the lanes whose corresponding template flag is true.
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
    // Permute selects indices 0-3 from value and 4-7 from Negate(value),
    // so each flag picks the original or the negated copy of its lane.
    const size_t lane0 = NegateLane0 ? 4 : 0;
    const size_t lane1 = NegateLane1 ? 5 : 1;
    const size_t lane2 = NegateLane2 ? 6 : 2;
    const size_t lane3 = NegateLane3 ? 7 : 3;
    return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}
2097 
// Specialization: no lane negated — identity, avoids the permute.
template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
    return value;
}
2102 
// Specialization: all lanes negated — plain Negate, avoids the permute.
template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
    return F128::Negate(value);
}
2107 
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
// Scalar lane classification helpers for the fallback paths.
// IEEE-754 single: exponent bits 0x7F800000 all set means Inf (mantissa 0)
// or NaN (mantissa non-zero).
#define NLIB_ISNAN(vec, idx) \
    ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif
2113 
2114 // r[i] = isnan(value[i]) ? 0xFFFFFFFF : 0
2115 NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
2116 #if defined(NLIB_F128_SIMD_NOUSE)
2117  f128 ret;
2118  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
2119  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
2120  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
2121  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
2122  return ret;
2123 #elif defined(CAFE)
2124  // on CAFE, value is NaN if value < 0 && -value < 0
2125  f32x2 one = __PS_FDUP(1.f);
2126  f32x2 minus_one = __PS_NEG(one);
2127  f32x2 v0 = value.vec.ps[0];
2128  f32x2 v1 = value.vec.ps[1];
2129  f32x2 t0 = __PS_SEL(v0, one, minus_one);
2130  f32x2 t1 = __PS_SEL(v1, one, minus_one);
2131  f128 ret;
2132  f32x2 v0neg = __PS_NEG(v0);
2133  f32x2 v1neg = __PS_NEG(v1);
2134  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
2135  ret.vec.ps[1] = __PS_SEL(v1neg, one, t0);
2136  return ret;
2137 #else
2138  return F128::CmpNe(value, value);
2139 #endif
2140 }
2141 
// r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(CAFE)
    // FLT_MAX - |inf| = -inf: only infinite lanes get a negative (sign-bit
    // set) result.  NOTE(review): this produces a sign-bit mask, not an
    // all-ones mask — consistent with the CAFE branches of MoveMask /
    // IsAllMaskTrue in this file, which test only bit 31.  Confirm no CAFE
    // caller uses the result as a full bitwise mask.
    f128 ret;
    f32x2 big_value = __PS_FDUP(FLT_MAX);
    ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
    return ret;
#else
    // |value| == +inf catches both +inf and -inf.
    f128 inf_value = F128::SetInfinity();
    f128 abs_value = F128::Abs(value);
    return F128::CmpEq(inf_value, abs_value);
#endif
}
2163 
// Round to nearest integer; for example, 6.54321 -> 7.0, -6.54321 -> -7.0
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
    return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
    // vrndaq rounds to nearest with ties away from zero.
    return vrndaq_f32(value);
#else
    // Add and Sub a big value to round the number after the decimal point
    // (0x4B000000 is 2^23; adding it forces the fraction bits out, and the
    // sign is OR-ed in so the trick works for negative inputs too).
    // NOTE(review): this and _mm_round_ps use round-to-nearest-even for
    // ties, while vrndaq_f32 rounds ties away from zero — platforms can
    // disagree on exact .5 inputs; confirm whether that matters to callers.
    f128 sgn = F128::And(value, F128::SetSignMask());
    f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
    f128 result = F128::Sub(F128::Add(value, sm), sm);
    return result;
#endif
}
2178 
// Round toward zero; for example, 6.54321 -> 6.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
// Note that there is no fraction if |value| > 2^23
// 2^23 = 8388608
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    for (size_t i = 0; i < 4; ++i) {
        if (NLIB_ISNAN(value.vec, i)) {
            // NaN lanes are normalized to the quiet NaN bit pattern.
            ret.vec.u[i] = 0x7FC00000U;
        } else {
            // Round-trip through int truncates; out-of-range values have no
            // fraction and pass through unchanged.
            ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                               ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                               : value.vec.v[i];
        }
    }
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // Truncate via int conversion where representable; pass through otherwise.
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    return F128::Select(cond, casted, value);
# else
    return vrndq_f32(value);
# endif
#endif
}
2209 
// Round toward -infinity; for example, 6.54321 -> 6.0, -6.54321 -> -7.0
NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = floorf(value.vec.v[0]);
    ret.vec.v[1] = floorf(value.vec.v[1]);
    ret.vec.v[2] = floorf(value.vec.v[2]);
    ret.vec.v[3] = floorf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, so negative non-integers come out one
    // too high and must be corrected downward.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // -1 if result is larger
    f128 largeMask = F128::CmpGt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
    return F128::Select(cond, casted, value);
# else
    return vrndmq_f32(value);
# endif
#endif
}
2240 
// Round toward +infinity; for example, 6.54321 -> 7.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = ceilf(value.vec.v[0]);
    ret.vec.v[1] = ceilf(value.vec.v[1]);
    ret.vec.v[2] = ceilf(value.vec.v[2]);
    ret.vec.v[3] = ceilf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, so positive non-integers come out one
    // too low and must be corrected upward.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // +1 if result is smaller
    f128 smallMask = F128::CmpLt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
    return F128::Select(cond, casted, value);
# else
    return vrndpq_f32(value);
# endif
#endif
}
2271 
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
// BUG FIX: the guard must match the one the macros were defined under
// (NOUSE || CAFE); the previous #ifdef NLIB_F128_SIMD_NOUSE left
// NLIB_ISNAN / NLIB_ISINF leaking out of this header on CAFE builds.
#undef NLIB_ISNAN
#undef NLIB_ISINF
#endif
2276 
2277 // r[i] = clamp(value[i], { 0, 0, 0, 0 }, { 1, 1, 1, 1 })
2278 NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
2279  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
2280 }
2281 
2282 NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
2283  static const float v_1_2pi = 0.15915494309189535f;
2284  static const float v_2pi = 6.283185307179586f;
2285  // value - 2pi * round(value * (1/2pi)) to be [-pi, pi)
2286  const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
2287  f128 round = F128::Round(F128::Mult(value, recpTwoPi));
2288  const f128 twoPi = F128::SetValue(v_2pi, each_float);
2289  return F128::MultSub(twoPi, round, value);
2290 }
2291 
// r[i] = sinf(value[i]), via range reduction and a minimax polynomial.
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use sin(x) == sin(pi - x), sin(x) == sin(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
    f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
    f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);

    // NOTE(review): the branch condition tests |value| (the raw input), not
    // |x| (the range-reduced angle).  For |value| > pi the reflection may be
    // applied even when |x| <= pi/2, evaluating the polynomial outside its
    // fitted interval and losing accuracy — confirm whether Abs(x) was
    // intended here (Cos and SinCos share the same pattern).
    f128 xabs = F128::Abs(value);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, pi);
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, pidiv2);
    x = F128::Select(cond, x, pi_x);

    // Odd polynomial in x: Horner evaluation over x^2, coefficients from
    // sin_coeff_ / sin_cvalue_, finishing with x - x * x^2 * poly(x^2).
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(sin_coeff_);
    f128 result;
    result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
    result = F128::Mult(xx, result);
    result = F128::MultSub(result, x, x);
    return result;
}
2325 
// r[i] = cosf(value[i]), via range reduction and a minimax polynomial.
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 cvalue = F128::LoadA16(cos_cvalue_);

    // NOTE(review): as in Sin, the condition tests |value| rather than the
    // range-reduced |x|; for |value| > pi this can reflect unnecessarily and
    // evaluate the polynomial outside [-pi/2, pi/2] — confirm intent.
    f128 xabs = F128::Abs(value);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
    x = F128::Select(cond, x, pi_x);

    // +1 if [-pi/2, pi/2], -1 otherwise
    // (sign is applied at the end because cos(pi - x) = -cos(x))
    f128 sign = F128::AndNot(cond, F128::SetSignMask());

    // xx = x^2
    // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(cos_coeff_);
    f128 result;
    result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetOne());
    // Flip the sign for lanes that were reflected.
    result = F128::Xor(sign, result);
    return result;
}
2362 
// Compute sin and cos of each lane in one pass; shares the range reduction.
// Returns { val[0] = sin, val[1] = cos }.
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
    // within [-pi, pi)
    const f128 signmask = F128::SetSignMask();
    f128 x = F128::ModAngle(value);

    // within [-pi/2, pi/2]
    // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
    // |x| <= pi/2 -> x = x
    // x > pi/2 -> x = pi - x
    // x < -pi/2 -> x = -pi - x
    f128 cvalue = F128::LoadA16(cos_cvalue_);

    // NOTE(review): as in Sin/Cos, the condition tests |value| rather than
    // the range-reduced |x| — confirm intent for |value| > pi.
    f128 xabs = F128::Abs(value);
    f128 xsign = F128::And(signmask, x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
    x = F128::Select(cond, x, pi_x);

    // +1 if [-pi/2, pi/2], -1 otherwise (only cos needs the sign flip;
    // sin is invariant under the pi - x reflection)
    f128 sign = F128::AndNot(cond, signmask);

    // xx = x^2
    // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
    f128 xx = F128::Mult(x, x);
    f128x2 ret;

    // cos
    {
        f128 coeff = F128::LoadA16(cos_coeff_);
        f128 result;
        result =
            F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
        result = F128::MultSub(xx, result, F128::SetOne());

        ret.val[1] = F128::Xor(sign, result); // cos
    }

    // sin
    {
        f128 coeff = F128::LoadA16(sin_coeff_);
        f128 result;
        result =
            F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);

        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
        result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
        result = F128::Mult(xx, result);
        ret.val[0] = F128::MultSub(result, x, x); // sin
    }
    return ret;
}
2422 
// r[i] = atanf(value[i]), reduced to |x| <= 1 via atan(x) = pi/2 - atan(1/x).
NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
    // |value| <= 1 -> atan(value)
    // value > 1 -> pi/2 - atan(1/value)
    // value < -1 -> -pi/2 - atan(1/value)
    f128 cmp, value_sign;
    {
        f128 one = F128::SetOne();

        // value_sign:
        // 1 if value > 1,
        // -1 if value < -1
        // (stored as a sign-bit pattern, applied to pi/2 with Xor below)
        value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
        cmp = F128::CmpLe(F128::Abs(value), one);
    }
    // Use the reciprocal for lanes outside [-1, 1].
    f128 x = F128::Select(cmp, value, F128::Recp(value));

    // atan(x) = x - 1/3 * x^3 + ... + (-1)^n/(2n+1) * x^(2n+1)
    // = x(1 - xx(1/3 - xx(1/5 - xx(1/7 - xx(1/9 - xx(1/11 - xx(1/13 - xx(1/15 - xx(1/17)...)
    // NOTE:
    // DO NOT USE TAYLOR SERIES
    // minmax approximation(the output of Remez algorithm)
    f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
    f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
    f128 xx = F128::Mult(x, x);
    f128 result;
    result = F128::MultSub<3>(coeff1, xx, F128::SetValue<2>(coeff1, each_select32), each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));

    result = F128::Mult(result, x);
    result = F128::MultSub(xx, result, x);

    // Reduced lanes: (+-pi/2) - atan(1/value), sign chosen by value_sign.
    f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
    f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
    result = F128::Select(cmp, result, result_another);
    return result;
}
2464 
// r[i] = atan2f(y[i], x[i]) with the IEEE special-case table below; the
// general case defers to ArcTan(y/x) with a +-pi correction for x < 0.
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
    // y / x -> value
    // 0 / - -> pi | sign(y)
    // 0 / + -> 0 | sign(y)
    // y / Inf -> 0 | sign(y)
    // y / -Inf -> pi | sign(y)
    // y!=0 / 0 -> pi/2 | sign(y)
    // y!=0 / - -> atan(y/x) + pi | sign(y)
    // Inf / x -> pi/2 | sign(y)
    // +-Inf / Inf -> pi/4 | sign(y)
    // +-Inf / -Inf -> 3pi/4 | sign(y)
    // otherwise -> atan(y/x)

    // sx = sign(x), sy = sign(y)
    // infx = isinf(x), infy = isinf(y)
    // zerox = iszero(x), zeroy = iszero(y)
    // posx = x > 0
    const f128 signmask = F128::SetSignMask();
    // const f128 sx = F128::And(x, signmask);
    const f128 sy = F128::And(y, signmask);
    const f128 infx = F128::IsInfinite(x);
    const f128 infy = F128::IsInfinite(y);
    const f128 zerox = F128::CmpEqZero(x);
    const f128 zeroy = F128::CmpEqZero(y);
    const f128 posx = F128::CmpGtZero(x);

    // v =
    // infy ?
    // infx ?
    // posx ? (pi/4 | sy) : (3pi/4 | sy)
    // : (pi/2 | sy)
    // : zeroy ?
    // posx ? (0 | sy) : (pi | sy)
    // : zerox ? (pi/2 | sy) : TrueMask;
    // Lanes that take no special case come out as all-ones (TrueMask).
    const f128 cval = F128::LoadA16(atan2_cvalue_);
    const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
    const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
    const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
    const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));

    f128 v = F128::Select(
        infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
        F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));

// mask = EqInt(v, full);
// result = Atan(y/x) + (posx ? (0 | sy) : (pi | sy))
// return mask ? result : v;
#if defined(NLIB_F128_SIMD_NOUSE)
    // Scalar: a lane is "no special case" iff its bits are exactly all-ones.
    f128 mask;
    mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
    mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
    mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
    mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(CAFE)
    // select makes 0xFFFFFFFFUL -> 0xFF7FFFFFUL
    // (CAFE's float Select flushes the pattern, so compare against that)
    f128 mask;
    mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
    // Integer compare of v against all-ones finds the "no special case" lanes.
    f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v),
                                                I128::SetValue(-1, each_int8)));
#endif
    // General case: atan(y/x), shifted by pi (with y's sign) when x <= 0.
    f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
    return F128::Select(mask, result, v);
}
2532 
2533 NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
2534  // asin(x) = atan2 (x, sqrt ((1.0 + x) * (1.0 - x)))
2535  f128 one = F128::SetOne();
2536  f128 tmp = F128::MultSub(value, value, one);
2537  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2538  return F128::ArcTan2(value, argx);
2539 }
2540 
2541 NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
2542  // acos(x) = atan2 (sqrt ((1.0 + x) * (1.0 - x)), x)
2543  f128 one = F128::SetOne();
2544  f128 tmp = F128::MultSub(value, value, one);
2545  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2546  return F128::ArcTan2(argx, value);
2547 }
2548 
// Pack the 4 lane mask bits into an int (bit i = lane i's MSB/mask);
// see _mm_movemask_ps of SSE
NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar path treats only exact all-ones lanes as "set".
    uint8_t ret = 0;
    ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
    ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
    ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
    ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
    // powers = { 1, 2, 4, 8 }: AND isolates per-lane bits, then sum them.
    uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
    uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
    uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
    uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
# ifdef __aarch64__
    return vaddvq_u32(a);
# else
    // Narrow and pairwise-add twice to fold the four lanes into one byte.
    uint16x4_t tmp = vmovn_u32(a);
    tmp = vpadd_u16(tmp, tmp);
    tmp = vpadd_u16(tmp, tmp);
    return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
# endif
#elif defined(CAFE)
    // CAFE masks live in the sign bit only; shift each into position.
    int tmp = (value.vec.u[0] >> 31);
    tmp |= (value.vec.u[1] >> 30) & 2;
    tmp |= (value.vec.u[2] >> 29) & 4;
    tmp |= (value.vec.u[3] >> 28) & 8;
    return tmp;
#endif
}
2581 
// true if value[i] == 0 for all i
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
    // PTEST: ZF is set iff (casted & casted) == 0.
    i128 casted = F128::CastToI128(value);
    return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    // Each zero lane compares to 0xFFFFFFFF (-1); all four sum to -4.
    uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
    return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
    // OR the two halves together; all-zero iff the 64-bit fold is zero.
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
# endif
#elif defined(CAFE)
    // CAFE masks live in the sign bit only, so just test bit 31 of the OR.
    uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
    return (tmp & 0x80000000U) == 0;
#endif
}
2603 
2604 // true if value[i] == 0xFFFFFFFF for all i
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
 return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
 value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
 // PTEST sets CF when (~casted & all-ones) == 0, i.e. every bit is one.
 i128 casted = F128::CastToI128(value);
 return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
 // Invert first; then each all-ones input lane becomes zero, and the same
 // "sum of compare-to-zero results == -4" trick as IsAllMaskFalse applies.
 uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
 return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
 // ARMv7: AND the low and high halves; the 64-bit result is -1 only if
 // every lane was all-ones.
 int32x4_t casted = vreinterpretq_s32_f32(value);
 int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
 return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
# endif
#elif defined(CAFE)
 // CAFE masks carry their truth value in the sign bit only.
 uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
 return (tmp & 0x80000000U) != 0;
#endif
}
2626 
template <size_t N>
// r = value[N]
// Extracts lane N (compile-time constant, 0..3) of value as a float.
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
 NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
 return value.vec.v[N];
#elif defined(NLIB_SSE41)
 // _MM_EXTRACT_FLOAT wraps EXTRACTPS and stores the lane into dest.
 float dest;
 _MM_EXTRACT_FLOAT(dest, value, N);
 return dest;
#elif defined(NLIB_NEON)
 return vgetq_lane_f32(value, N);
#elif defined(CAFE)
 // Lanes are stored as two f32x2 paired-single halves.
 return value.vec.ps[N / 2][N % 2];
#endif
}
2643 
template <size_t N>
// r = *reinterpret_cast<uint32_t*>(&value[N])
// Extracts the raw 32-bit pattern of lane N (compile-time constant, 0..3).
NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
 NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
 return value.vec.u[N];
#elif defined(NLIB_SSE41)
 // _mm_extract_ps returns the lane's bit pattern as an int.
 return _mm_extract_ps(value, N);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vreinterpretq_u32_f32(value);
 return vgetq_lane_u32(tmp, N);
#elif defined(CAFE)
 return value.vec.u[N];
#endif
}
2659 
2660 // r = value[idx]
2661 NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT { // NOLINT
2662 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2663  return value.vec.v[idx];
2664 #elif defined(NLIB_SSE41)
2665  float dest;
2666  switch (idx) {
2667  case 0:
2668  _MM_EXTRACT_FLOAT(dest, value, 0);
2669  break;
2670  case 1:
2671  _MM_EXTRACT_FLOAT(dest, value, 1);
2672  break;
2673  case 2:
2674  _MM_EXTRACT_FLOAT(dest, value, 2);
2675  break;
2676  case 3:
2677  _MM_EXTRACT_FLOAT(dest, value, 3);
2678  break;
2679  default:
2680  NLIB_ASSUME(0);
2681  break;
2682  }
2683  return dest;
2684 #elif defined(NLIB_NEON)
2685  switch (idx) {
2686  case 0:
2687  return vgetq_lane_f32(value, 0);
2688  case 1:
2689  return vgetq_lane_f32(value, 1);
2690  case 2:
2691  return vgetq_lane_f32(value, 2);
2692  case 3:
2693  return vgetq_lane_f32(value, 3);
2694  default:
2695  NLIB_ASSUME(0);
2696  break;
2697  }
2698 #endif
2699 }
2700 
2701 // r = *reinterpret_cast<uint32_t*>(&value[idx])
2702 NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2703 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2704  return value.vec.u[idx];
2705 #elif defined(NLIB_SSE41)
2706  switch (idx) {
2707  case 0:
2708  return static_cast<uint32_t>(_mm_extract_ps(value, 0));
2709  case 1:
2710  return static_cast<uint32_t>(_mm_extract_ps(value, 1));
2711  case 2:
2712  return static_cast<uint32_t>(_mm_extract_ps(value, 2));
2713  case 3:
2714  return static_cast<uint32_t>(_mm_extract_ps(value, 3));
2715  default:
2716  NLIB_ASSUME(0);
2717  break;
2718  }
2719 #elif defined(NLIB_NEON)
2720  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2721  switch (idx) {
2722  case 0:
2723  return vgetq_lane_u32(tmp, 0);
2724  case 1:
2725  return vgetq_lane_u32(tmp, 1);
2726  case 2:
2727  return vgetq_lane_u32(tmp, 2);
2728  case 3:
2729  return vgetq_lane_u32(tmp, 3);
2730  default:
2731  NLIB_ASSUME(0);
2732  break;
2733  }
2734 #endif
2735 }
2736 
template <size_t N>
// r = value, r[N] = v
// Returns a copy of value with lane N (compile-time constant) replaced by v.
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT { // NOLINT
 NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret = value;
 ret.vec.v[N] = v;
 return ret;
#elif defined(NLIB_SSE41)
 // INSERTPS: copy lane 0 of tmp into lane N of value (immediate N << 4).
 f128 tmp = _mm_set_ss(v);
 return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
 // When v is a compile-time constant, a dup + permute sequence is used;
 // otherwise fall back to a plain lane insert.
 return __builtin_constant_p(v) ?
 F128::Permute<N == 0 ? 4 : 0,
 N == 1 ? 5 : 1,
 N == 2 ? 6 : 2,
 N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
 vsetq_lane_f32(v, value, N);
#elif defined(CAFE)
 f128 ret = value;
 ret.vec.ps[N / 2][N % 2] = v;
 return ret;
#endif
}
2761 
2762 // r = value, r[i] = v
2763 NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
2764 #ifdef NLIB_F128_SIMD_NOUSE
2765  f128 ret = value;
2766  ret.vec.v[i] = v;
2767  return ret;
2768 #elif defined(NLIB_SSE41)
2769  f128 tmp = _mm_set_ss(v);
2770  switch (i) {
2771  case 0:
2772  return _mm_insert_ps(value, tmp, 0x00);
2773  case 1:
2774  return _mm_insert_ps(value, tmp, 0x10);
2775  case 2:
2776  return _mm_insert_ps(value, tmp, 0x20);
2777  case 3:
2778  return _mm_insert_ps(value, tmp, 0x30);
2779  default:
2780  NLIB_ASSUME(0);
2781  break;
2782  }
2783 #elif defined(NLIB_NEON)
2784  switch (i) {
2785  case 0:
2786  return F128::SetFloatToLane<0>(value, v);
2787  case 1:
2788  return F128::SetFloatToLane<1>(value, v);
2789  case 2:
2790  return F128::SetFloatToLane<2>(value, v);
2791  case 3:
2792  return F128::SetFloatToLane<3>(value, v);
2793  default:
2794  NLIB_ASSUME(0);
2795  break;
2796  }
2797 #elif defined(CAFE)
2798  f128 ret = value;
2799  switch (i) {
2800  case 0:
2801  ret.vec.ps[0][0] = v;
2802  break;
2803  case 1:
2804  ret.vec.ps[0][1] = v;
2805  break;
2806  case 2:
2807  ret.vec.ps[1][0] = v;
2808  break;
2809  default:
2810  ret.vec.ps[1][1] = v;
2811  break;
2812  }
2813  return ret;
2814 #endif
2815 }
2816 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Merges one lane of a and one lane of b into a float32x2_t:
// r = { a[IsHighA ? 1 : 0], b[IsHighB ? 1 : 0] }.
template <bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

// r = { a[0], b[0] }
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn1_f32(a, b);
#else
 return vtrn_f32(a, b).val[0];
#endif
};

// r = { a[1], b[0] } -- a is reversed first so TRN1 picks its high lane
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn1_f32(vrev64_f32(a), b);
#else
 return vtrn_f32(vrev64_f32(a), b).val[0];
#endif
};

// r = { a[0], b[1] } -- b is reversed first so TRN1 picks its high lane
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn1_f32(a, vrev64_f32(b));
#else
 return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif
};

// r = { a[1], b[1] }
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn2_f32(a, b);
#else
 return vtrn_f32(a, b).val[1];
#endif
};

// Returns the low (Z == 0) or high (Z == 1) 64-bit half of value.
template <int Z>
float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
 return vget_low_f32(value);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
 return vget_high_f32(value);
}

// Generic 2-lane swizzle: r = { value[X0], value[X1] }. The specializations
// below cover the lane pairs NEON can produce with fewer instructions.
template <int X0, int X1>
struct F128SwizzleHelper2 {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
 float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
 }
};

// r = { value[X], value[X] } -- broadcast one lane of the relevant half
template <int X>
struct F128SwizzleHelper2<X, X> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t x = F128SwizzleGet64<X / 2>(value);
 return vdup_lane_f32(x, (X & 1));
 }
};

// r = { value[0], value[1] } -- identity on the low half
template <>
struct F128SwizzleHelper2<0, 1> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return vget_low_f32(value);
 }
};

// r = { value[0], value[2] }
template <>
struct F128SwizzleHelper2<0, 2> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vget_low_f32(vuzp1q_f32(value, value));
#else
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vget_high_f32(value);
 return vzip_f32(lo, hi).val[0];
#endif
 }
};

// r = { value[0], value[3] }
template <>
struct F128SwizzleHelper2<0, 3> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
 return vzip1_f32(lo, hi);
#else
 return vzip_f32(lo, hi).val[0];
#endif
 }
};

// r = { value[1], value[0] } -- reversed low half
template <>
struct F128SwizzleHelper2<1, 0> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return vrev64_f32(vget_low_f32(value));
 }
};

// r = { value[1], value[2] }
template <>
struct F128SwizzleHelper2<1, 2> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
 return vzip2_f32(lo, hi);
#else
 return vzip_f32(lo, hi).val[1];
#endif
 }
};

// r = { value[1], value[3] }
template <>
struct F128SwizzleHelper2<1, 3> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vget_low_f32(vuzp2q_f32(value, value));
#else
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vget_high_f32(value);
 return vzip_f32(lo, hi).val[1];
#endif
 }
};

// r = { value[2], value[0] }
template <>
struct F128SwizzleHelper2<2, 0> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vget_high_f32(value);
 return vzip_f32(hi, lo).val[0];
#endif
 }
};

// r = { value[2], value[1] }
template <>
struct F128SwizzleHelper2<2, 1> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vrev64_f32(vget_high_f32(value));
 return vzip_f32(hi, lo).val[1];
#endif
 }
};

// r = { value[2], value[3] } -- identity on the high half
template <>
struct F128SwizzleHelper2<2, 3> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return vget_high_f32(value);
 }
};

// r = { value[3], value[0] }
template <>
struct F128SwizzleHelper2<3, 0> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
 return vzip1_f32(hi, lo);
#else
 return vzip_f32(hi, lo).val[0];
#endif
 }
};

// r = { value[3], value[1] }
template <>
struct F128SwizzleHelper2<3, 1> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t lo = vget_low_f32(value);
 float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
 return vzip2_f32(hi, lo);
#else
 return vzip_f32(hi, lo).val[1];
#endif
 }
};

// r = { value[3], value[2] } -- reversed high half
template <>
struct F128SwizzleHelper2<3, 2> {
 static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return vrev64_f32(vget_high_f32(value));
 }
};

// Full 4-lane swizzle: build each 64-bit half with a 2-lane swizzle.
template <int V0, int V1, int V2, int V3>
struct F128SwizzleHelper {
 static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
 detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
 }
};

// Both halves identical: compute the pair once and duplicate it.
template <int Vx, int Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
 static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
 return vcombine_f32(tmp, tmp);
 }
};

// All four lanes identical: broadcast a single lane.
template <int V>
struct F128SwizzleHelper<V, V, V, V> {
 static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
 return F128::SetValue<V>(value, each_select32);
 }
};

} // namespace detail
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// 2-lane swizzle over the four input lanes {v0[0], v0[1], v1[0], v1[1]}:
// r = { lane[X0], lane[X1] }, built from the paired-single merge intrinsics
// (__PS_MERGEab = { first[a], second[b] }). One specialization per (X0, X1).
template <int X0, int X1>
struct F128SwizzleHelper {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT;
};

// r = { v0[0], v0[0] }
template<>
struct F128SwizzleHelper<0, 0> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v1;
 return __PS_MERGE00(v0, v0);
 }
};

// r = v0 unchanged
template<>
struct F128SwizzleHelper<0, 1> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v1;
 return v0;
 }
};

// r = { v0[0], v1[0] }
template<>
struct F128SwizzleHelper<0, 2> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE00(v0, v1);
 }
};

// r = { v0[0], v1[1] }
template<>
struct F128SwizzleHelper<0, 3> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE01(v0, v1);
 }
};

// r = { v0[1], v0[0] }
template<>
struct F128SwizzleHelper<1, 0> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v1;
 return __PS_MERGE10(v0, v0);
 }
};

// r = { v0[1], v0[1] }
template<>
struct F128SwizzleHelper<1, 1> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v1;
 return __PS_MERGE11(v0, v0);
 }
};

// r = { v0[1], v1[0] }
template<>
struct F128SwizzleHelper<1, 2> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE10(v0, v1);
 }
};

// r = { v0[1], v1[1] }
template<>
struct F128SwizzleHelper<1, 3> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE11(v0, v1);
 }
};

// r = { v1[0], v0[0] }
template<>
struct F128SwizzleHelper<2, 0> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE00(v1, v0);
 }
};

// r = { v1[0], v0[1] }
template<>
struct F128SwizzleHelper<2, 1> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE01(v1, v0);
 }
};

// r = { v1[0], v1[0] }
template<>
struct F128SwizzleHelper<2, 2> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v0;
 return __PS_MERGE00(v1, v1);
 }
};

// r = v1 unchanged
template<>
struct F128SwizzleHelper<2, 3> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v0;
 return v1;
 }
};

// r = { v1[1], v0[0] }
template<>
struct F128SwizzleHelper<3, 0> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE10(v1, v0);
 }
};

// r = { v1[1], v0[1] }
template<>
struct F128SwizzleHelper<3, 1> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 return __PS_MERGE11(v1, v0);
 }
};

// r = { v1[1], v1[0] }
template<>
struct F128SwizzleHelper<3, 2> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v0;
 return __PS_MERGE10(v1, v1);
 }
};

// r = { v1[1], v1[1] }
template<>
struct F128SwizzleHelper<3, 3> {
 static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
 (void)v0;
 return __PS_MERGE11(v1, v1);
 }
};

} // namespace detail
#endif
3175 
template <int V0, int V1, int V2, int V3>
// r[0] = value[V0], r[1] = value[V1], r[2] = value[V2], r[3] = value[V3]
// Compile-time lane shuffle of a single vector. An index of -1 means
// "keep the lane unchanged" (i.e. lane i falls back to index i).
NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V0 < 4);
 NLIB_STATIC_ASSERT(V1 < 4);
 NLIB_STATIC_ASSERT(V2 < 4);
 NLIB_STATIC_ASSERT(V3 < 4);
#if defined(NLIB_F128_SIMD_NOUSE)
 f128 ret;
 ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
 ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
 ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
 ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
 return ret;
#elif __has_builtin(__builtin_shufflevector)
 // clang: let the compiler pick the best shuffle sequence.
 return __builtin_shufflevector(value, value, V0, V1, V2, V3);
#elif defined(NLIB_SSE41)
 return _mm_shuffle_ps(value, value,
 _MM_SHUFFLE(V3 != -1 ? V3 : 3,
 V2 != -1 ? V2 : 2,
 V1 != -1 ? V1 : 1,
 V0 != -1 ? V0 : 0));
#elif defined(NLIB_NEON)
 // Dispatch to the per-lane-pattern helpers in detail::.
 return detail::F128SwizzleHelper<
 V0 != -1 ? V0 : 0,
 V1 != -1 ? V1 : 1,
 V2 != -1 ? V2 : 2,
 V3 != -1 ? V3 : 3>::Swizzle(value);
#elif defined(CAFE)
 // Build each paired-single half with a 2-lane swizzle helper.
 f128 ret;
 ret.vec.ps[0] = detail::F128SwizzleHelper<
 (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
 ret.vec.ps[1] = detail::F128SwizzleHelper<
 (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
 return ret;
#endif
}
3213 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Swizzle specialization for NEON
// Patterns that map directly onto a single ZIP/TRN/UZP/REV/EXT instruction.
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vzip1q_f32(value, value);
#else
 return vzipq_f32(value, value).val[0];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn1q_f32(value, value);
#else
 return vtrnq_f32(value, value).val[0];
#endif
}
// Identity swizzle: no work needed.
template <>
NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
 return value;
}
template <>
NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vuzp1q_f32(value, value);
#else
 return vuzpq_f32(value, value).val[0];
#endif
}
// Swap lanes within each 64-bit pair.
template <>
NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
 return vrev64q_f32(value);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vtrn2q_f32(value, value);
#else
 return vtrnq_f32(value, value).val[1];
#endif
}
// Rotate left by one lane via EXT.
template <>
NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
 uint32x4_t ival = vreinterpretq_u32_f32(value);
 uint32x4_t rotated = vextq_u32(ival, ival, 1);
 return vreinterpretq_f32_u32(rotated);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vuzp2q_f32(value, value);
#else
 return vuzpq_f32(value, value).val[1];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
 return vzip2q_f32(value, value);
#else
 return vzipq_f32(value, value).val[1];
#endif
}
// Rotate left by two lanes (swap halves) via EXT.
template <>
NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
 uint32x4_t ival = vreinterpretq_u32_f32(value);
 uint32x4_t rotated = vextq_u32(ival, ival, 2);
 return vreinterpretq_f32_u32(rotated);
}
// Rotate left by three lanes (right by one) via EXT.
template <>
NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
 uint32x4_t ival = vreinterpretq_u32_f32(value);
 uint32x4_t rotated = vextq_u32(ival, ival, 3);
 return vreinterpretq_f32_u32(rotated);
}
#endif
3291 
namespace detail {

#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Two-vector permute: r[i] = (Vi < 4) ? a[Vi] : b[Vi - 4].
// Generic fallback: swizzle both inputs into position, then BLENDPS picks
// each lane from a (index < 4) or b (index >= 4).
template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2 {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
 f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
 return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
 }
};

// UseBlend: every lane keeps its own position (Vi % 4 == i), so a single
// BLENDPS suffices without pre-swizzling.
template <bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
 }
};

// UseShuffle: the low pair comes from one vector and the high pair from the
// other, which is exactly what SHUFPS produces.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
 _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
 }
};

// r = {a[1], a[2], a[3], b[0]}: PALIGNR on the b:a concatenation, shift 4.
template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
 return _mm_castsi128_ps(tmp);
 }
};

// r = {a[3], b[0], b[1], b[2]}: PALIGNR on the b:a concatenation, shift 12.
template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
 return _mm_castsi128_ps(tmp);
 }
};
3336 
3337 template <>
3338 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3339  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3340  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
3341  return _mm_castsi128_ps(tmp);
3342  }
3343 };
3344 
// Exactly one lane comes from b (lane 0): INSERTPS copies b[V-4] into a[0].
template <int V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V > 3);
 return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
 }
};

// One lane from b (lane 1): INSERTPS copies b[V-4] into a[1].
template <int V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V > 3);
 return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
 }
};

// One lane from b (lane 2): INSERTPS copies b[V-4] into a[2].
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V > 3);
 return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
 }
};

// One lane from b (lane 3): INSERTPS copies b[V-4] into a[3].
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V > 3);
 return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
 }
};

// Mirror cases: exactly one lane comes from a, inserted into b.
template <int V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V < 4);
 return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
 }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V < 4);
 return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
 }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V < 4);
 return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
 }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(V < 4);
 return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
 }
};

// Dispatcher: computes the UseBlend / UseShuffle flags at compile time and
// forwards to the matching F128PermuteHelper2 specialization.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 return F128PermuteHelper2<
 ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
 ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
 V0, V1, V2, V3>::Permute(a, b);
 }
};
3418 
#elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)

// Returns one 64-bit half of the (a, b) pair, addressed as quarters:
// Z = 0 -> low(a), 1 -> high(a), 2 -> low(b), 3 -> high(b).
template <int Z>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_UNUSED(b);
 return vget_low_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_UNUSED(b);
 return vget_high_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_UNUSED(a);
 return vget_low_f32(b);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
 NLIB_UNUSED(a);
 return vget_high_f32(b);
}

// 2-lane permute over the 8 lanes of (a, b): r = { lane[X0], lane[X1] }.
template <int X0, int X1>
struct F128PermuteHelper2 {
 static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
 float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
 }
};

// Same lane twice: broadcast it.
template <int X>
struct F128PermuteHelper2<X, X> {
 static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 float32x2_t x = F128PermuteGet64<X / 2>(a, b);
 return vdup_lane_f32(x, (X & 1));
 }
};

// Full 4-lane permute: build each half with a 2-lane permute.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
 F128PermuteHelper2<V2, V3>::Permute(a, b));
 }
};

// r = {a[1], a[2], a[3], b[0]}: single EXT on (a, b).
template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
 return vreinterpretq_f32_s32(tmp);
 }
};

// r = {a[3], b[0], b[1], b[2]}: single EXT on (a, b).
template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
 return vreinterpretq_f32_s32(tmp);
 }
};

// r = {b[1], b[2], b[3], a[0]}: single EXT on (b, a).
template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
 return vreinterpretq_f32_s32(tmp);
 }
};
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// 2-lane permute over the four paired-single halves (v0, v1) = a, (v2, v3) = b:
// r = { chosen_half_0[R0], chosen_half_1[R1] }, where VAR0/VAR1 select which
// of the four halves each result lane reads from. Each specialization simply
// forwards the selected halves to the matching F128SwizzleHelper.
template<int R0, int R1, int VAR0, int VAR1>
struct F128PermuteHelper2 {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 0> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 1> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 2> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 3> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
 }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
 }
};

// Full 4-lane permute: each output half picks its two source halves
// (Vi / 2) and the lane within each half (Vi & 1).
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
 f128 ret;
 f32x2 x0 = a.vec.ps[0];
 f32x2 x1 = a.vec.ps[1];
 f32x2 x2 = b.vec.ps[0];
 f32x2 x3 = b.vec.ps[1];
 ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
 ::Permute(x0, x1, x2, x3);
 ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
 ::Permute(x0, x1, x2, x3);
 return ret;
 }
};
3626 #else
3627 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3628 struct F128PermuteHelper {
3629  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3630  f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
3631  F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
3632  F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
3633  F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
3634  return ret;
3635  }
3636 };
3637 #endif
3638 
3639 template <int V0, int V1, int V2, int V3>
3640 struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
3641  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3642  NLIB_UNUSED(b);
3643  return F128::Swizzle<V0, V1, V2, V3>(a);
3644  }
3645 };
3646 
3647 template <int V0, int V1, int V2, int V3>
3648 struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
3649  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3650  NLIB_UNUSED(a);
3651  return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
3652  }
3653 };
3654 
3655 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3656 // Permute specialization for SSE4.1
3657 template <>
3658 struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
3659  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3660  return _mm_unpacklo_ps(a, b);
3661  }
3662 };
3663 template <>
3664 struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
3665  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3666  return _mm_unpacklo_ps(b, a);
3667  }
3668 };
3669 template <>
3670 struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
3671  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3672  return _mm_unpackhi_ps(a, b);
3673  }
3674 };
3675 template <>
3676 struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
3677  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3678  return _mm_unpackhi_ps(b, a);
3679  }
3680 };
3681 #endif
3682 
3683 template<int V0, int V1, int V2, int V3>
3684 struct F128PermuteDontCareHelper {
3685  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3686  NLIB_STATIC_ASSERT(V0 < 8);
3687  NLIB_STATIC_ASSERT(V1 < 8);
3688  NLIB_STATIC_ASSERT(V2 < 8);
3689  NLIB_STATIC_ASSERT(V3 < 8);
3690  static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
3691  static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
3692  return detail::F128PermuteHelper< arg1, arg2,
3693  V0, V1, V2, V3 >::Permute(a, b);
3694  }
3695 };
3696 
3697 template<int V1, int V2, int V3>
3698 struct F128PermuteDontCareHelper<8, V1, V2, V3> {
3699  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3700  NLIB_STATIC_ASSERT(V1 < 8);
3701  NLIB_STATIC_ASSERT(V2 < 8);
3702  NLIB_STATIC_ASSERT(V3 < 8);
3703  static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3704  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3705  }
3706 };
3707 
3708 template<int V0, int V2, int V3>
3709 struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
3710  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3711  NLIB_STATIC_ASSERT(V0 < 8);
3712  NLIB_STATIC_ASSERT(V2 < 8);
3713  NLIB_STATIC_ASSERT(V3 < 8);
3714  static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
3715  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3716  }
3717 };
3718 
3719 template<int V0, int V1, int V3>
3720 struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
3721  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3722  NLIB_STATIC_ASSERT(V0 < 8);
3723  NLIB_STATIC_ASSERT(V1 < 8);
3724  NLIB_STATIC_ASSERT(V3 < 8);
3725  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3726  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3727  }
3728 };
3729 
3730 template<int V0, int V1, int V2>
3731 struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
3732  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3733  NLIB_STATIC_ASSERT(V0 < 8);
3734  NLIB_STATIC_ASSERT(V1 < 8);
3735  NLIB_STATIC_ASSERT(V2 < 8);
3736  static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
3737  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3738  }
3739 };
3740 
3741 template<int V2, int V3>
3742 struct F128PermuteDontCareHelper<8, 8, V2, V3> {
3743  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3744  NLIB_STATIC_ASSERT(V2 < 8);
3745  NLIB_STATIC_ASSERT(V3 < 8);
3746  static const int V0 = (V2 < 4) ? 0 : 4;
3747  return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
3748  }
3749 };
3750 
3751 template<int V1, int V2>
3752 struct F128PermuteDontCareHelper<8, V1, V2, 8> {
3753  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3754  NLIB_STATIC_ASSERT(V1 < 8);
3755  NLIB_STATIC_ASSERT(V2 < 8);
3756  static const int V0 = (V1 & 1) ? V1 - 1: V1;
3757  static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3758  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3759  }
3760 };
3761 
3762 template<int V0, int V1>
3763 struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
3764  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3765  NLIB_STATIC_ASSERT(V0 < 8);
3766  NLIB_STATIC_ASSERT(V1 < 8);
3767  static const int V2 = (V1 < 4) ? 2 : 6;
3768  return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
3769  }
3770 };
3771 
3772 template<int V0, int V3>
3773 struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
3774  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3775  NLIB_STATIC_ASSERT(V0 < 8);
3776  NLIB_STATIC_ASSERT(V3 < 8);
3777  static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3778  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3779  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3780  }
3781 };
3782 
3783 template<int V0, int V2>
3784 struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
3785  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3786  NLIB_STATIC_ASSERT(V0 < 8);
3787  NLIB_STATIC_ASSERT(V2 < 8);
3788  static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3789  static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3790  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3791  }
3792 };
3793 
3794 template<int V1, int V3>
3795 struct F128PermuteDontCareHelper<8, V1, 8, V3> {
3796  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3797  NLIB_STATIC_ASSERT(V1 < 8);
3798  NLIB_STATIC_ASSERT(V3 < 8);
3799  static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3800  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3801  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3802  }
3803 };
3804 
3805 template<int V>
3806 struct F128PermuteDontCareHelper<V, 8, 8, 8> {
3807  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3808  NLIB_STATIC_ASSERT(V < 8);
3809  static const int V1 = ((V & 3) == 0) ? V + 1 : V;
3810  static const int V2 = ((V & 3) == 0) ? V + 2 : V;
3811  static const int V3 = ((V & 3) == 0) ? V + 3 : V;
3812  return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
3813  }
3814 };
3815 
3816 template<int V>
3817 struct F128PermuteDontCareHelper<8, V, 8, 8> {
3818  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3819  NLIB_STATIC_ASSERT(V < 8);
3820  static const int V0 = ((V & 3) == 1) ? V - 1 : V;
3821  static const int V2 = ((V & 3) == 1) ? V + 1 : V;
3822  static const int V3 = ((V & 3) == 1) ? V + 2 : V;
3823  return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
3824  }
3825 };
3826 
3827 template<int V>
3828 struct F128PermuteDontCareHelper<8, 8, V, 8> {
3829  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3830  NLIB_STATIC_ASSERT(V < 8);
3831  static const int V0 = ((V & 3) == 2) ? V - 2 : V;
3832  static const int V1 = ((V & 3) == 2) ? V - 1 : V;
3833  static const int V3 = ((V & 3) == 2) ? V + 2 : V;
3834  return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
3835  }
3836 };
3837 
3838 template<int V>
3839 struct F128PermuteDontCareHelper<8, 8, 8, V> {
3840  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3841  NLIB_STATIC_ASSERT(V < 8);
3842  static const int V0 = ((V & 3) == 3) ? V - 3 : V;
3843  static const int V1 = ((V & 3) == 3) ? V - 2 : V;
3844  static const int V2 = ((V & 3) == 3) ? V - 1 : V;
3845  return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
3846  }
3847 };
3848 
3849 template <>
3850 struct F128PermuteDontCareHelper<8, 8, 8, 8> {
3851  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3852  NLIB_UNUSED(b);
3853  return a;
3854  }
3855 };
3856 
3857 } // namespace detail
3858 
3859 template <int V0, int V1, int V2, int V3>
3860 // r[0] = V0 < 4 ? a[V0] : b[V0 - 4], .....
3861 NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3862 #if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
3863  return __builtin_shufflevector(a, b,
3864  (V0 != 8 ? V0 : -1),
3865  (V1 != 8 ? V1 : -1),
3866  (V2 != 8 ? V2 : -1),
3867  (V3 != 8 ? V3 : -1));
3868 #else
3869  return detail::F128PermuteDontCareHelper <
3870  V0 != -1 ? V0 : 8,
3871  V1 != -1 ? V1 : 8,
3872  V2 != -1 ? V2 : 8,
3873  V3 != -1 ? V3 : 8>::Permute(a, b);
3874 #endif
3875 }
3876 
3877 template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
3878 // r[i] = InsertLane(i) ? insert_value[i] : value[i]
3879 // note that splat[0] = splat[1] = splat[2] = splat[3]
3880 NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
3881 #if defined(NLIB_NEON)
3882  const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
3883  const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
3884  const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
3885  const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
3886 #else
3887  // SSE4.1 has _mm_blend_ps()
3888  const int v0 = SplatLane0 ? 4 : 0;
3889  const int v1 = SplatLane1 ? 5 : 1;
3890  const int v2 = SplatLane2 ? 6 : 2;
3891  const int v3 = SplatLane3 ? 7 : 3;
3892 #endif
3893  return F128::Permute<v0, v1, v2, v3>(value, splat);
3894 }
3895 
3896 NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
3897 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3898  f128 ret;
3899  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
3900  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
3901  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
3902  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
3903  return ret;
3904 #else
3905  i128 iround = F128::ConvertToI128Round(value);
3906  f128 fround = F128::ConvertFromI128(iround);
3907  f128 x = F128::Sub(value, fround);
3908  f128 xx = F128::Mult(x, x);
3909 
3910  f128 P = F128::LoadA16(F128::exp2_P_);
3911  f128 Q = F128::LoadA16(F128::exp2_Q_);
3912 
3913  f128 px;
3914  // px = P[0]
3915  // px = px * xx + P[1]
3916  // px = px * xx + P[2]
3917  // px = x * px;
3918  px = F128::MultAdd<0>(P, xx, F128::SetValue<1>(P, each_select32), each_select32);
3919  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
3920  px = F128::Mult(x, px);
3921 
3922  f128 qx;
3923  // qx = xx + Q[0]
3924  // qx = qx * xx + Q[1]
3925  qx = F128::Add(xx, F128::SetValue<0>(Q, each_select32));
3926  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
3927 
3928  x = F128::Div(px, F128::Sub(qx, px));
3929  x = F128::MultAdd<3>(Q, x, F128::SetValue<2>(Q, each_select32), each_select32);
3930 
3931  // x = x * 2^iround
3932  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
3933  iround = I128::ShiftLeftLogical32(iround, 23);
3934  x = F128::Mult(x, F128::CastFromI128(iround));
3935 
3936  // NOTE:
3937  // overflow not checked
3938  return x;
3939 #endif
3940 }
3941 
3942 NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
3943  static const float log2e = 1.44269504088896340736f;
3944  return Exp2(F128::Mult(log2e, value));
3945 }
3946 
3947 NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
3948  static const float log2e = 1.44269504088896340736f;
3949  f128 negOne = F128::SetValue(-1.f, each_float);
3950  f128 v0 = F128::MultAdd(log2e, value, negOne);
3951  f128 v1 = F128::MultSub(log2e, value, negOne);
3952  f128 e0 = Exp2(v0);
3953  f128 e1 = Exp2(v1);
3954  return F128::Sub(e0, e1);
3955 }
3956 
3957 NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
3958  static const float log2e = 1.44269504088896340736f;
3959  f128 negOne = F128::SetValue(-1.f, each_float);
3960  f128 v0 = F128::MultAdd(log2e, value, negOne);
3961  f128 v1 = F128::MultSub(log2e, value, negOne);
3962  f128 e0 = Exp2(v0);
3963  f128 e1 = Exp2(v1);
3964  return F128::Add(e0, e1);
3965 }
3966 
3967 NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
3968  // 1 - 2 * (1 + expE(2x))
3969  f128 cvalue = F128::LoadA16(tanh_cvalue_);
3970  f128 e = F128::Mult<0>(cvalue, value, each_select32);
3971  e = F128::Exp2(e);
3972  f128 half = F128::SetValue<2>(cvalue, each_select32);
3973  e = F128::MultAdd(half, e, half);
3974  e = F128::Recp(e);
3975  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
3976 }
3977 
3978 NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
3979 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3980  f128 ret;
3981  ret.vec.v[0] = tanf(value.vec.v[0]);
3982  ret.vec.v[1] = tanf(value.vec.v[1]);
3983  ret.vec.v[2] = tanf(value.vec.v[2]);
3984  ret.vec.v[3] = tanf(value.vec.v[3]);
3985  return ret;
3986 #else
3987  // Cody and Waite algorithm
3988  f128 C = F128::LoadA16(&F128::tan_c_[0]);
3989 
3990  // g = round(value / (pi/2))
3991  f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
3992  f128 nearXAxis;
3993  {
3994  i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
3995  i128 cmp = I128::CmpEq32(t0, I128::SetZero());
3996  nearXAxis = F128::CastFromI128(cmp);
3997  }
3998 
3999  // f = value - (pi/2) * g
4000  f128 f = F128::MultSub<1>(C, g, value, each_select32);
4001  f = F128::MultSub<2>(C, g, f, each_select32);
4002 
4003  f128 nearAxis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));
4004 
4005  f128 P = F128::LoadA16(&F128::tan_p_[0]);
4006  f128 Q = F128::LoadA16(&F128::tan_q_[0]);
4007 
4008  f128 ff = F128::Mult(f, f);
4009  f128 one = F128::SetValue<3>(P, each_select32);
4010 
4011  f128 p = F128::MultAdd<2>(P, ff, F128::SetValue<1>(P, each_select32), each_select32);
4012  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
4013  p = F128::MultAdd(p, ff, one);
4014  p = F128::Mult(f, p);
4015 
4016  f128 q = F128::MultAdd<3>(Q, ff, F128::SetValue<2>(Q, each_select32), each_select32);
4017  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
4018  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
4019  q = F128::MultAdd(q, ff, one);
4020 
4021  p = F128::Select(nearAxis, f, p);
4022  q = F128::Select(nearAxis, one, q);
4023 
4024  f128 r0 = F128::Div(p, q);
4025  f128 r1 = F128::Negate(F128::Recp(r0));
4026 
4027  return F128::Select(nearXAxis, r0, r1);
4028 #endif
4029 }
4030 
4031 NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
4032 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
4033  static const float scale = 1.4426950408889634f; // 1 / LogE(2.0)
4034  f128 ret;
4035  ret.vec.v[0] = logf(value.vec.v[0]);
4036  ret.vec.v[1] = logf(value.vec.v[1]);
4037  ret.vec.v[2] = logf(value.vec.v[2]);
4038  ret.vec.v[3] = logf(value.vec.v[3]);
4039  return F128::Mult(scale, ret);
4040 #else
4041  // x = frexp(value, &e)
4042  f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
4043  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
4044  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
4045  e = I128::ShiftRightLogical32(e, 23);
4046  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
4047 
4048  x = F128::Sub(x, F128::SetOne());
4049  f128 z = F128::Mult(x, x);
4050  f128 y;
4051 
4052  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
4053  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
4054  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
4055 
4056  f128 p = F128::SetValue<0>(pq0, each_select32);
4057  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
4058  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
4059  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
4060  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
4061  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
4062 
4063  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
4064  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
4065  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
4066  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
4067  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
4068 
4069  y = F128::Mult(z, p);
4070  y = F128::Div(y, q);
4071  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
4072 
4073  f128 result;
4074  {
4075  // do not optimize
4076  f128 log2ea = F128::SetValue<3>(pq2, each_select32);
4077  result = F128::Mult(y, log2ea);
4078  result = F128::MultAdd(log2ea, x, result);
4079  result = F128::Add(result, y);
4080  result = F128::Add(result, x);
4081  result = F128::Add(result, F128::ConvertFromI128(e));
4082  }
4083 
4084  {
4085  f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
4086 
4087  // value is NaN -> NaN
4088  f128 is_nan = F128::IsNaN(value);
4089  f128 nan = F128::SetValue<0>(nan_inf, each_select32);
4090  result = F128::Select(is_nan, nan, result);
4091 
4092  f128 is_inf = F128::IsInfinite(value);
4093  f128 is_pos = F128::CmpGtZero(value);
4094 
4095  // value == inf -> +inf
4096  f128 inf = F128::SetValue<1>(nan_inf, each_select32);
4097  f128 is_pos_inf = F128::And(is_inf, is_pos);
4098  result = F128::Select(is_pos_inf, inf, result);
4099 
4100  // value == 0 -> -inf
4101  f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
4102  f128 is_zero = F128::CmpEqZero(value);
4103  result = F128::Select(is_zero, neg_inf, result);
4104 
4105  // value < 0 -> -NaN
4106  f128 neg_nan = F128::SetValue<2>(nan_inf, each_select32);
4107  f128 is_neg = F128::CmpLtZero(value);
4108  result = F128::Select(is_neg, neg_nan, result);
4109 
4110  // otherwise -> Log2(value)
4111  }
4112 
4113  return result;
4114 #endif
4115 }
4116 
4117 NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
4118 #ifdef NLIB_F128_SIMD_NOUSE
4119  f128 ret;
4120  ret.vec.v[0] = logf(value.vec.v[0]);
4121  ret.vec.v[1] = logf(value.vec.v[1]);
4122  ret.vec.v[2] = logf(value.vec.v[2]);
4123  ret.vec.v[3] = logf(value.vec.v[3]);
4124  return ret;
4125 #else
4126  f128 x = F128::Log2(value);
4127  static const float recp_log2e = 0.6931471805597018f;
4128  return F128::Mult(recp_log2e, x);
4129 #endif
4130 }
4131 
4132 #undef NLIB_M
4133 #undef NLIB_M2
4134 #endif // NLIB_DOXYGEN
4135 
4136 typedef f128 SimdVector;
4137 typedef f128arg SimdVectorArg;
4138 typedef f128 SimdQuaternion;
4139 typedef f128arg SimdQuaternionArg;
4140 typedef f128 SimdPlane;
4141 typedef f128arg SimdPlaneArg;
4142 typedef f128 SimdSphere;
4143 typedef f128arg SimdSphereArg;
4144 
4145 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4146 struct NLIB_ALIGNAS(16) SimdMatrix {
4147 #else
4148 struct SimdMatrix {
4149 #endif
4150  public:
4152  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
4153  r[0] = r0;
4154  r[1] = r1;
4155  r[2] = r2;
4156  r[3] = r3;
4157  }
4158  SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4159  float m13, float m20, float m21, float m22, float m23, float m30, float m31,
4160  float m32, float m33) NLIB_NOEXCEPT;
4161  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
4162 
4163  public:
4164  f128 r[4];
4165 };
4166 
4167 inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
4168  float m12, float m13, float m20, float m21, float m22, float m23,
4169  float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
4170  r[0] = F128::SetValue(m00, m01, m02, m03);
4171  r[1] = F128::SetValue(m10, m11, m12, m13);
4172  r[2] = F128::SetValue(m20, m21, m22, m23);
4173  r[3] = F128::SetValue(m30, m31, m32, m33);
4174 }
4175 
4176 inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
4177  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4178  NLIB_ASSERT((algn & 3) == 0);
4179  switch (algn >> 2) {
4180  case 0:
4181  r[0] = F128::LoadA16(p);
4182  r[1] = F128::LoadA16(p + 4);
4183  r[2] = F128::LoadA16(p + 8);
4184  r[3] = F128::LoadA16(p + 12);
4185  break;
4186  case 1:
4187  r[0] = F128::LoadA4(p);
4188  r[1] = F128::LoadA4(p + 4);
4189  r[2] = F128::LoadA4(p + 8);
4190  r[3] = F128::LoadA4(p + 12);
4191  break;
4192  case 2:
4193  r[0] = F128::LoadA8(p);
4194  r[1] = F128::LoadA8(p + 4);
4195  r[2] = F128::LoadA8(p + 8);
4196  r[3] = F128::LoadA8(p + 12);
4197  break;
4198  default:
4199  NLIB_ASSUME(0);
4200  break;
4201  }
4202 }
4203 
4204 #if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
4205 typedef const SimdMatrix& SimdMatrixArg;
4206 #else
4207 typedef const SimdMatrix SimdMatrixArg;
4208 #endif
4209 
4210 #if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
/* 4x4 transpose: interleave 64-bit pairs, then pick even/odd 32-bit lanes. */
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f128 lo01_ = F128::Permute<0, 1, 4, 5>(row0, row1); \
        f128 hi01_ = F128::Permute<2, 3, 6, 7>(row0, row1); \
        f128 lo23_ = F128::Permute<0, 1, 4, 5>(row2, row3); \
        f128 hi23_ = F128::Permute<2, 3, 6, 7>(row2, row3); \
        row0 = F128::Permute<0, 2, 4, 6>(lo01_, lo23_); \
        row1 = F128::Permute<1, 3, 5, 7>(lo01_, lo23_); \
        row2 = F128::Permute<0, 2, 4, 6>(hi01_, hi23_); \
        row3 = F128::Permute<1, 3, 5, 7>(hi01_, hi23_); \
    }
4222 #elif defined(NLIB_NEON)
4223 # ifdef __aarch64__
/* AArch64 4x4 transpose: vtrnq_f32 transposes the 2x2 float blocks, then */
/* vtrn1q/vtrn2q on u64 lanes swaps the off-diagonal 2x2 blocks. */
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t tr01_ = vtrnq_f32(row0, row1); \
        float32x4x2_t tr23_ = vtrnq_f32(row2, row3); \
        uint64x2_t u0_, u1_, u2_, u3_; \
        u0_ = vtrn1q_u64(vreinterpretq_u64_f32(tr01_.val[0]), \
                         vreinterpretq_u64_f32(tr23_.val[0])); \
        row0 = vreinterpretq_f32_u64(u0_); \
        u1_ = vtrn1q_u64(vreinterpretq_u64_f32(tr01_.val[1]), \
                         vreinterpretq_u64_f32(tr23_.val[1])); \
        row1 = vreinterpretq_f32_u64(u1_); \
        u2_ = vtrn2q_u64(vreinterpretq_u64_f32(tr01_.val[0]), \
                         vreinterpretq_u64_f32(tr23_.val[0])); \
        row2 = vreinterpretq_f32_u64(u2_); \
        u3_ = vtrn2q_u64(vreinterpretq_u64_f32(tr01_.val[1]), \
                         vreinterpretq_u64_f32(tr23_.val[1])); \
        row3 = vreinterpretq_f32_u64(u3_); \
    }
4242 # else
/* ARMv7 4x4 transpose: vtrnq_f32 transposes the 2x2 float blocks, then the */
/* low/high d-registers are recombined to swap the off-diagonal blocks. */
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t tr01_ = vtrnq_f32(row0, row1); \
        float32x4x2_t tr23_ = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(tr01_.val[0]), vget_low_f32(tr23_.val[0])); \
        row1 = vcombine_f32(vget_low_f32(tr01_.val[1]), vget_low_f32(tr23_.val[1])); \
        row2 = vcombine_f32(vget_high_f32(tr01_.val[0]), vget_high_f32(tr23_.val[0])); \
        row3 = vcombine_f32(vget_high_f32(tr01_.val[1]), vget_high_f32(tr23_.val[1])); \
    }
4252 # endif
4253 #elif defined(CAFE)
/* Cafe paired-single 4x4 transpose. Diagonal 2x2 blocks are transposed in */
/* place with __PS_MERGE; the off-diagonal blocks are transposed while being */
/* swapped through row0/row1's upper halves. The statement order matters. */
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f32x2 ta_, tb_; \
        ta_ = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
        tb_ = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
        row0.vec.ps[0] = ta_; \
        row1.vec.ps[0] = tb_; \
        ta_ = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
        tb_ = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
        row2.vec.ps[1] = ta_; \
        row3.vec.ps[1] = tb_; \
        ta_ = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tb_ = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = row2.vec.ps[0]; \
        row1.vec.ps[1] = row3.vec.ps[0]; \
        row2.vec.ps[0] = ta_; \
        row3.vec.ps[0] = tb_; \
        ta_ = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tb_ = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = ta_; \
        row1.vec.ps[1] = tb_; \
    }
4276 #endif
4277 
4279  float x;
4280  float y;
4281  float z;
4282 };
4283 
4285  float x;
4286  float y;
4287  float z;
4288  float w;
4289 };
4290 
4291 struct NLIB_VIS_PUBLIC Float2x3 {
4292  float m[2][3];
4293 };
4294 
4296  float m[3][3];
4297 };
4298 
4299 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4300 struct NLIB_ALIGNAS(16) Float3x4 {
4301 #else
4302 struct Float3x4 {
4303 #endif
4304  float m[3][4];
4305 };
4306 
4307 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4308 struct NLIB_ALIGNAS(16) Float4x3 {
4309 #else
4310 struct Float4x3 {
4311 #endif
4312  float m[4][3];
4313 };
4314 
4315 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4316 struct NLIB_ALIGNAS(16) Float4x4 {
4317 #else
4318 struct Float4x4 {
4319 #endif
4320  float m[4][4];
4321 };
4322 
4323 } // namespace simd
4324 NLIB_NAMESPACE_END
4325 
4326 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
float x
3次元ベクトルのx座標です。
Definition: SimdFloat.h:4279
SimdMatrix()
デフォルトコンストラクタです。
Definition: SimdFloat.h:4151
4x4行列を扱う関数が集められたクラスです。
Definition: SimdMatrix.h:15
#define NLIB_ALWAYS_INLINE
コンパイラに関数をインライン展開するように強く示します。
Definition: Platform_unix.h:69
視錐台を表すクラスです。
Definition: SimdGeometry.h:105
クォータニオンを扱う関数が集められたクラスです。
f128arg SimdVectorArg
f128argがtypedefされています。
Definition: SimdFloat.h:4137
128bitの単精度浮動小数点数用SIMDレジスタを2つ持つ型です。
Definition: SimdFloat.h:35
float x
4次元ベクトルのx座標です。
Definition: SimdFloat.h:4285
float y
4次元ベクトルのy座標です。
Definition: SimdFloat.h:4286
整数のSIMD演算を行うためのクラスや関数が実装されています。
SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) noexcept
引数から行列をセットアップします。
Definition: SimdFloat.h:4152
空の構造体で32bit単位に分けたレーンを選択することを示すためのタグです。
Definition: SimdInt.h:44
#define NLIB_VIS_HIDDEN
関数やクラス等のシンボルをライブラリの外部に公開しません。
Definition: Platform_unix.h:60
f128arg SimdSphereArg
f128argがtypedefされています。
Definition: SimdFloat.h:4143
#define NLIB_VIS_PUBLIC
関数やクラス等のシンボルをライブラリの外部に公開します。
Definition: Platform_unix.h:61
OBB(有向境界ボックス)を表すクラスです。中心座標(center)とxyz軸方向の大きさ(extent)及び回転クォータニ...
Definition: SimdGeometry.h:80
static f128 ShiftRight(f128arg a, f128arg b) noexcept
a を右にシフトして空いた部分にb の要素を順にシフトする形で設定します。
Definition: SimdFloat.h:333
空の構造体で単精度浮動小数点数を示すためのタグです。
Definition: SimdFloat.h:54
#define NLIB_ASSUME(cond)
cond が真であることを示してコンパイラに最適化のヒントを与えます。
Definition: Platform.h:581
包含関係の判定を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:268
3次元空間上の球を扱う静的メンバ関数が集められたクラスです。このクラスはインスタンス化できません。 ...
Definition: SimdGeometry.h:40
constexpr const each_float_tag each_float
each_float_tag型の定数オブジェクトで、単精度浮動小数点数を示すためのタグです。
Definition: SimdFloat.h:55
f128arg SimdQuaternionArg
f128argがtypedefされています。
Definition: SimdFloat.h:4139
nlib_i128_t i128
nlib_i128_tがtypedefされています。
Definition: SimdInt.h:63
3次元空間上の平面を扱う関数が集められたクラスです。
Definition: SimdGeometry.h:21
f128arg SimdPlaneArg
f128argがtypedefされています。
Definition: SimdFloat.h:4141
3次元ベクトルの計算を行う関数が集められたクラスです。全ての関数でレーン3に設定された値は無視されます...
Definition: SimdVector3.h:13
static f128 RotateLeft(f128arg value) noexcept
4個の単精度浮動小数点数を左にN 個分回転させます。
Definition: SimdFloat.h:320
距離(の2乗)の計算を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:134
4次元ベクトルをメモリから読み出したりメモリに書き出したりするための型です。float型のx, y, z, wをデータメンバとして保持します。
Definition: SimdFloat.h:4284
const f128 f128arg
const f128, 又はconst f128&がtypedefされています。
Definition: SimdFloat.h:65
4x4行列を保持する構造体です。
Definition: SimdFloat.h:4148
float z
4次元ベクトルのz座標です。
Definition: SimdFloat.h:4287
nlib_f128x2_t f128x2
nlib_f128x2_tがtypedefされています。
Definition: SimdFloat.h:60
f128 SimdSphere
f128がtypedefされています。球を扱う場合に利用されます。
Definition: SimdFloat.h:4142
128bitレジスタ(SSEではXMM0-XMM15, NEONではQ0-Q15)を用いて単精度浮動小数点数のSIMD演算を行うためのクラ...
Definition: SimdFloat.h:80
constexpr const each_uint32_tag each_uint32
each_uint32_tag型の定数オブジェクトで、32bitの符号なし整数を示すためのタグです。
Definition: SimdInt.h:40
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
Definition: Config.h:86
#define NLIB_CEXPR
利用可能であればconstexprが定義されます。そうでない場合は空文字列です。
Definition: Config.h:80
開発環境別の設定が書かれるファイルです。
4次元ベクトルの計算を行う関数が集められたクラスです。
Definition: SimdVector4.h:11
3次元ベクトルをメモリから読み出したりメモリに書き出したりするための型です。float型のx, y, zをデータメンバとして保持します。
Definition: SimdFloat.h:4278
#define NLIB_ALIGNAS(x)
alignas(x)又は同等の定義がされます。
Definition: Config.h:221
constexpr const each_int8_tag each_int8
each_int8_tag型の定数オブジェクトで、8bitの符号付き整数を示すためのタグです。
Definition: SimdInt.h:34
constexpr const each_select32_tag each_select32
each_select32_tag型の定数オブジェクトで、32bitのレーンを選択することを示すためのタグです。 ...
Definition: SimdInt.h:50
4x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x3の配列で16バイ...
Definition: SimdFloat.h:4310
3x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x3の配列です。 ...
Definition: SimdFloat.h:4295
空の構造体で32bitの符号なし整数を示すためのタグです。
Definition: SimdInt.h:30
static f128 RotateRight(f128arg value) noexcept
4個の単精度浮動小数点数を右にN 個分回転させます。
Definition: SimdFloat.h:327
float y
3次元ベクトルのy座標です。
Definition: SimdFloat.h:4280
nlib_f128_t f128
nlib_f128_tがtypedefされています。
Definition: SimdFloat.h:58
float z
3次元ベクトルのz座標です。
Definition: SimdFloat.h:4281
3次元空間におけるAABB(軸並行境界ボックス)を表すクラスです。最小座標(point_min)と最大座標(point_max)を...
Definition: SimdGeometry.h:61
constexpr const each_int32_tag each_int32
each_int32_tag型の定数オブジェクトで、32bitの符号付き整数を示すためのタグです。
Definition: SimdInt.h:36
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
Definition: Config.h:136
float w
4次元ベクトルのw座標です。
Definition: SimdFloat.h:4288
交差の判定を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:178
f128 SimdQuaternion
f128がtypedefされています。クォータニオンを扱う場合に利用されます。
Definition: SimdFloat.h:4138
4x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x4の配列で16バイ...
Definition: SimdFloat.h:4318
3x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x4の配列で16バイ...
Definition: SimdFloat.h:4302
f128 SimdPlane
f128がtypedefされています。平面を扱う場合に利用されます。
Definition: SimdFloat.h:4140
__m128 nlib_f128_t
128bitの単精度浮動小数点数用SIMDレジスタのための型です。
Definition: SimdFloat.h:34
f128 SimdVector
f128がtypedefされています。3次元ベクトル又は4次元ベクトルを扱う場合に利用されます。 ...
Definition: SimdFloat.h:4136