SimdFloat.h

/*--------------------------------------------------------------------------------*
  Project: CrossRoad
  Copyright (C)Nintendo All rights reserved.

  These coded instructions, statements, and computer programs contain proprietary
  information of Nintendo and/or its licensed developers and are protected by
  national and international copyright laws. They may not be disclosed to third
  parties or copied or duplicated in any form, in whole or in part, without the
  prior written consent of Nintendo.

  The content herein is highly confidential and should be handled accordingly.
 *--------------------------------------------------------------------------------*/

#pragma once
#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

#ifdef NN_PLATFORM_CTR
# ifndef __USE_C99_MATH
# define __USE_C99_MATH
# endif
#endif
#include <math.h>
#include <float.h>

#include "nn/nlib/Config.h"
#include "nn/nlib/simd/SimdInt.h"

#ifndef INFINITY
#define INFINITY ((float)(1e+300 * 1e+300))
#endif

#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
typedef struct {
    union {
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_SSE41)
typedef __m128 nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_NEON)
typedef float32x4_t nlib_f128_t;
typedef float32x4x2_t nlib_f128x2_t;
#elif defined(CAFE)
typedef struct {
    union {
        f32x2 ps[2];
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#endif

NLIB_NAMESPACE_BEGIN
namespace simd {

// use each_float for the argument value
struct each_float_tag {};
const each_float_tag each_float = each_float_tag();

// __m128(SSE), float32x4_t(NEON)
typedef nlib_f128_t f128;
// float32x4x2_t(NEON)
typedef nlib_f128x2_t f128x2;

#if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg;
#else
typedef const f128 f128arg;
#endif

#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif

#if !defined(_MSC_VER) && !defined(__vectorcall)
#define __vectorcall
#endif

class F128 {
 public:
  static f128 __vectorcall SetValue(float v, each_float_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall SetZero() NLIB_NOEXCEPT;
  static f128 __vectorcall Set1000() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0100() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0010() NLIB_NOEXCEPT;
  static f128 __vectorcall Set0001() NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SetOne() NLIB_NOEXCEPT;
  static f128 __vectorcall SetNegativeOne() NLIB_NOEXCEPT;
  static f128 __vectorcall SetEpsilon() NLIB_NOEXCEPT;
  static f128 __vectorcall SetInfinity() NLIB_NOEXCEPT;
  static f128 __vectorcall SetNaN() NLIB_NOEXCEPT;
  static f128 __vectorcall SetSignMask() NLIB_NOEXCEPT;

  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;

  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  /*
  static f128 __vectorcall ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT;
  */

#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
  static f128 __vectorcall ConvertFromI128(i128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertToI128Round(f128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT;

  static f128 __vectorcall CastFromI128(i128 value) NLIB_NOEXCEPT;
  static i128 __vectorcall CastToI128(f128 value) NLIB_NOEXCEPT;

  template<int N>
  static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
  template<int N>
  static i128 __vectorcall ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT;
#endif

  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
  template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
                                   each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
                                   each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Abs(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;

  //
  // Min/Max
  //

  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;

  //
  // Reciprocal/Sqrt
  //

  static f128 __vectorcall Recp(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Sqrt(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;

  //
  // Round/Truncate
  //

  static f128 __vectorcall Round(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Floor(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Ceil(f128arg value) NLIB_NOEXCEPT;

  //
  // Logical Operations
  //

  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Not(f128arg a) NLIB_NOEXCEPT;
  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;

  //
  // Comparison Operations
  //

  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;

  static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;

  //
  // Trigonometric Functions
  //
  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Sin(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Cos(f128arg value) NLIB_NOEXCEPT;
  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Tan(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SinH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CosH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall TanH(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
  // TODO: are Est versions needed?

  //
  // Interpolation
  //

  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;

  //
  // Exp/Log
  //
  static f128 __vectorcall Exp2(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ExpE(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Log2(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall LogE(f128arg value) NLIB_NOEXCEPT;  // not implemented

  //
  // Misc
  //

  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall IsNaN(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;

  //
  // Get/Set
  //

  template <size_t N>
  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;

  template <size_t N>
  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;

  //
  // Swizzle/Permute
  //

  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
  template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;

  template <size_t N>
  // left <- value[3], value[2], value[1], value[0] -> right
  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
    const size_t NN = 4 - N;
    return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
  }
  template <size_t N>
  // left <- value[3], value[2], value[1], value[0] -> right
  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
    return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
  }
  template <size_t N>
  // left <- b[3], ..., b[0], a[3], ..., a[0] -> right
  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
    return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
  }
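
  // Lane diagrams for the helpers above (illustrative, not in the original
  // header), writing a vector as {v[0], v[1], v[2], v[3]}:
  //   RotateLeft<1>(v)    -> {v[3], v[0], v[1], v[2]}
  //   RotateRight<1>(v)   -> {v[1], v[2], v[3], v[0]}
  //   ShiftRight<1>(a, b) -> {a[1], a[2], a[3], b[0]}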

 private:
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v1000_[4];  // 1.f, 0.f, 0.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0100_[4];  // 0.f, 1.f, 0.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0010_[4];  // 0.f, 0.f, 1.f, 0.f
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0001_[4];  // 0.f, 0.f, 0.f, 1.f

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float pi_values_[4];  // pi, -pi, 2pi, pi/2
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const uint32_t nan_inf_[4];  // nan, inf, -nan, -inf

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R0_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R1_[4];

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R0_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R1_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R2_[4];

  // Those coefficients are from Cephes Math Library
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_coeff_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_coeff_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan_coeff_[8];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan2_cvalue_[4];

  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_P_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_Q_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tanh_cvalue_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_p_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_q_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_c_[4];
  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float log2_PQ_[12];

  F128();  // forbidden
  friend class Vector3;
  friend class Vector4;
  friend class Matrix;
  friend class Plane;
  friend class Quaternion;
  friend class Sphere;
  friend class AxisAlignedBox;
  friend class OrientedBox;
  friend class Frustum;

  friend class DistanceSq;
  friend class Intersection;
  friend class Containment;
};

#ifndef NLIB_DOXYGEN

#undef NLIB_M
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// r[i] = v
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = v;
  ret.vec.v[1] = v;
  ret.vec.v[2] = v;
  ret.vec.v[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
  return ret;
#endif
}

// r[i] = *reinterpret_cast<float*>(&v)
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.u[0] = v;
  ret.vec.u[1] = v;
  ret.vec.u[2] = v;
  ret.vec.u[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#elif defined(CAFE)
  union {
    float f32;
    uint32_t u32;
  } tmp;
  tmp.u32 = v;
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
  return ret;
#endif
}

// r[0] = a, r[1] = b, r[2] = c, r[3] = d
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a;
  ret.vec.v[1] = b;
  ret.vec.v[2] = c;
  ret.vec.v[3] = d;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
  union {
    float f32[2];
    uint64_t u64;
  } tmp1, tmp2;
  tmp1.f32[0] = a;
  tmp1.f32[1] = b;
  tmp2.f32[0] = c;
  tmp2.f32[1] = d;
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
  return ret;
#endif
}

template <size_t N>
// r[i] = value[N]
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);
  return vdupq_lane_f32(tmp, N);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
  return ret;
#endif
}

#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
  return ret;
}
#endif

// r[i] = 0.f
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
  return ret;
#endif
}

NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 1.f;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x10, x00);
#else
  return F128::LoadA16(F128::v1000_);
#endif
}

NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 1.f;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x01, x00);
#else
  return F128::LoadA16(F128::v0100_);
#endif
}

NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 1.f;
  ret.vec.v[3] = 0;
  return ret;
#elif defined(NLIB_NEON)
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x10);
#else
  return F128::LoadA16(F128::v0010_);
#endif
}

NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = 0;
  ret.vec.v[1] = 0;
  ret.vec.v[2] = 0;
  ret.vec.v[3] = 1.f;
  return ret;
#elif defined(NLIB_NEON)
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x01);
#else
  return F128::LoadA16(F128::v0001_);
#endif
}

template <size_t N>
// r = value, r[N] = 0.f
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
  return F128::Permute<N == 0 ? 4 : 0,
                       N == 1 ? 5 : 1,
                       N == 2 ? 6 : 2,
                       N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
  // return vsetq_lane_f32(0.f, value, N);
#elif defined(CAFE)
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = 0.f;
  return ret;
#endif
}

// r[i] = 1.f
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
  return F128::SetValue(1.f, each_float);
}

// r[i] = -1.f
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
  return F128::SetValue(-1.f, each_float);
}

// r[i] = 1.0e-7f
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
  return F128::SetValue(1.0e-7f, each_float);
}

// r[i] = 0x7F800000U
NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
  return F128::SetValue(0x7F800000U, each_uint32);
}

// r[i] = 0x7FC00000U
NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
  return F128::SetValue(0x7FC00000U, each_uint32);
}

// r[i] = -0.f(0x80000000U)
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
  return F128::SetValue(-0.f, each_float);
}
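
// The sign-bit mask from SetSignMask() enables branchless sign tricks. This
// sketch is illustrative and not part of the original header, assuming
// AndNot(a, b) computes (~a) & b as the SSE path of Abs() below does:
//   f128 v    = F128::SetValue(-3.f, each_float);
//   f128 absv = F128::AndNot(F128::SetSignMask(), v);  // sign cleared -> 3.f
//   f128 negv = F128::Xor(F128::SetSignMask(), v);     // sign flipped -> 3.f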

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = p[0];
  ret.vec.v[1] = p[1];
  ret.vec.v[2] = p[2];
  ret.vec.v[3] = p[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return LoadA16(p);
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
  return vld1q_f32(p);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  return LoadA4(p);
#endif
}

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
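
// The A16/A8/A4 suffix states the alignment the pointer must guarantee;
// passing a pointer less aligned than the suffix promises is invalid.
// Illustrative sketch (not part of the original header):
//   NLIB_ALIGNAS(16) float buf[4] = {0.f, 1.f, 2.f, 3.f};
//   f128 x = F128::LoadA16(buf);      // OK: buf is 16-byte aligned
//   f128 y = F128::LoadA4(buf + 1);   // only 4-byte aligned, so use LoadA4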

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  StoreA16(p, value);
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
  vst1q_f32(p, value);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  StoreA4(p, value);
#endif
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);
  vst1_f32(p, tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  vst1_f32(p, vget_high_f32(value));
#elif defined(CAFE)
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_high_f32(value);
  vst1_f32(p, tmp);
#elif defined(CAFE)
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}

// r[i] = fabs(value[i])
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
  ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
  ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
  ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vabsq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);  // 0x80000000
  return _mm_andnot_ps(signmask, value);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
  return ret;
#endif
}

// r[i] = mask[i] ? a[i] : b[i]
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 result;
  result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
  result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
  result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
  result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
  return result;
#elif defined(NLIB_SSE41)
  return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(CAFE)
  // avoid NaN
  f128 mask_ = mask;
  mask_.vec.u[0] &= 0xFF7FFFFFUL;
  mask_.vec.u[1] &= 0xFF7FFFFFUL;
  mask_.vec.u[2] &= 0xFF7FFFFFUL;
  mask_.vec.u[3] &= 0xFF7FFFFFUL;
  // mask_ < 0 ? a : b
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
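
// A comparison plus Select() replaces per-lane branches. Illustrative sketch
// (not from the original header): cap every lane of v at limit, i.e. a
// per-lane minimum:
//   f128 mask = F128::CmpGt(v, limit);         // all-ones where v[i] > limit[i]
//   f128 r    = F128::Select(mask, limit, v);  // limit where v > limit, else v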

/*
NLIB_M(f128) F128::ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  float16x4_t lo = vget_low_f16(vreinterpretq_f16_f32(value));
  return vcvt_f32_f16(lo);
#else
  // _mm_cvtph_ps
  (void)value;
  return F128::SetZero();
#endif
}

NLIB_M(f128) F128::ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vcvt_high_f32_f16(value);
#else
  float16x4_t hi = vget_high_f16(vreinterpretq_f16_f32(value));
  return vcvt_f32_f16(hi);
#endif
#else
  // _mm_cvtph_ps
  (void)value;
  return F128::SetZero();
#endif
}

NLIB_M(f128) F128::ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  float16x4_t lo = vcvt_f16_f32(a);
  float16x8_t x = vcvt_high_f16_f32(lo, b);
  return vreinterpretq_f32_f16(x);
#else
  float16x4_t lo = vcvt_f16_f32(a);
  float16x4_t hi = vcvt_f16_f32(b);
  return vreinterpretq_f32_f16(vcombine_f16(lo, hi));
#endif
#else
  // _mm_cvtps_ph
  (void)a;
  (void)b;
  return F128::SetZero();
#endif
}
*/

#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// r[i] = static_cast<float>(value[i])
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
  return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}

// r[i] = *reinterpret_cast<float*>(&value[i])
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_f32_s8(value);
#endif
}

// r[i] = static_cast<int>(roundf(value[i]))
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
  uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
  uint32x4_t sgn = vdupq_n_u32(0x80000000U);
  uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
  w = vorrq_u32(w, half);
  return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}

// r[i] = static_cast<int>(value[i]), truncated toward zero
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}

// r[i] = *reinterpret_cast<int*>(&value[i])
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_f32(value);
#endif
}

template<int N>
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
  return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
  f128 f = F128::ConvertFromI128(value);
  f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32);  // 0.5f^N
  return F128::Mult(f, m);
#endif
}

template<int N>
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
  f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32);  // 2.f^N
  f128 f = F128::Mult(value, m);
  return F128::ConvertToI128Truncate(f);
#endif
}
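
// The non-NEON fallbacks above build the scale factor as raw IEEE-754 bits: a
// float with exponent field e and zero mantissa encodes 2^(e - 127), so
// ((0x7F + N) << 23) is the bit pattern of 2.f^N and ((0x7F - N) << 23) that
// of 2.f^-N. Worked example for N = 8: (0x7F + 8) << 23 = 0x43800000 = 256.f,
// so ConvertToFixedPoint<8> scales by 256 and truncates, producing 24.8
// fixed point.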

#endif

// r[i] = (a[i] < b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcltq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

// r[i] = (a[i] <= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcleq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

// r[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgtq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

// r[i] = (a[i] >= b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgeq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

// r[i] = (a[i] != b[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
  return vreinterpretq_f32_u32(tmp);
#endif
}

// r[i] = a[i] + b[i]
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
  return vaddq_f32(a, b);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// r[i] = a[i] - b[i]
NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
  return vsubq_f32(a, b);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// r[i] = -value[i]
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = -value.vec.v[0];
  ret.vec.v[1] = -value.vec.v[1];
  ret.vec.v[2] = -value.vec.v[2];
  ret.vec.v[3] = -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vnegq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);  // 0x80000000
  return _mm_xor_ps(signmask, value);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
  return ret;
#endif
}

// r[i] = a[i] * b[i]
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
  return vmulq_f32(a, b);
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// r[i] = a * b[i]
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
  return ret;
#else
  return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}

template <size_t N>
// r[i] = a[N] * b[i]
NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
  return vmulq_laneq_f32(b, a, N);
# else
  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
  return vmulq_n_f32(b, tmp);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  float t = a.vec.ps[N / 2][N % 2];
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
  return ret;
#else
  return F128::Mult(F128::SetValue<N>(a, each_select32), b);
#endif
}

// r[i] = a[i] / b[i]
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vdivq_f32(a, b);
# else
  float32x4_t inv0 = vrecpeq_f32(b);
  float32x4_t step0 = vrecpsq_f32(inv0, b);
  float32x4_t inv1 = vmulq_f32(step0, inv0);
  float32x4_t step1 = vrecpsq_f32(inv1, b);
  float32x4_t inv2 = vmulq_f32(step1, inv1);
  uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
  inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
  return vmulq_f32(a, inv2);
# endif
#elif defined(CAFE)
  f128 ret;
  ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}
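
// Note on the 32-bit NEON path of Div() above: there is no vdivq_f32, so the
// rough reciprocal estimate from vrecpeq_f32 is refined with two
// Newton-Raphson steps. vrecpsq_f32(x, b) computes (2 - x * b), and
// x' = x * (2 - x * b) roughly doubles the number of correct bits per step.
// The final vceqq_f32/vbslq_f32 pair patches b == 0 lanes to +infinity
// instead of returning the estimate's meaningless result.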

// r[i] = max(a[i], b[i])
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
  return vmaxq_f32(a, b);
#elif defined(CAFE)
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// r[i] = min(a[i], b[i])
NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
  return vminq_f32(a, b);
#elif defined(CAFE)
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}

// r[0] = max(a[0], a[1]), r[1] = max(a[2], a[3]), r[2] = max(b[0], b[1]), r[3] = max(b[2], b[3])
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vpmaxq_f32(a, b);
# else
  float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
  f32x2 v02, v13, cmp;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
  return ret;
#endif
}

// r[0] = min(a[0], a[1]), r[1] = min(a[2], a[3]), r[2] = min(b[0], b[1]), r[3] = min(b[2], b[3])
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vpminq_f32(a, b);
# else
  float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
  f32x2 v02, v13, cmp;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
  return ret;
#endif
}

// r[0] = a[0] + a[1], r[1] = a[2] + a[3], r[2] = b[0] + b[1], r[3] = b[2] + b[3]
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  return vpaddq_f32(a, b);
# else
  float32x2_t al = vget_low_f32(a);
  float32x2_t ah = vget_high_f32(a);
  float32x2_t l = vpadd_f32(al, ah);

  float32x2_t bl = vget_low_f32(b);
  float32x2_t bh = vget_high_f32(b);
  float32x2_t h = vpadd_f32(bl, bh);
  return vcombine_f32(l, h);
# endif
#elif defined(CAFE)
  f32x2 v02, v13;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  ret.vec.ps[0] = __PS_ADD(v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  ret.vec.ps[1] = __PS_ADD(v02, v13);
  return ret;
#endif
}
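
// Two rounds of PairwiseAdd() give a horizontal sum broadcast to all lanes.
// Illustrative sketch (not from the original header):
//   f128 s = F128::PairwiseAdd(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
//   s = F128::PairwiseAdd(s, s);       // every lane holds v0+v1+v2+v3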

// r[i] = fabs(a[i] - b[i])
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vabdq_f32(a, b);
#else
  return F128::Abs(F128::Sub(a, b));
#endif
}

// r[i] = c[i] + a[i] * b[i]
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
  return vfmaq_f32(c, a, b);
# else
  return vmlaq_f32(c, a, b);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Add(c, F128::Mult(a, b));
#endif
}

// r[i] = c[i] + a * b[i]
NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
  return vfmaq_n_f32(c, b, a);
# else
  return vmlaq_n_f32(c, b, a);
# endif
#else
  return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}

template <size_t N>
// r[i] = c[i] + a[N] * b[i]
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
  return vfmaq_laneq_f32(c, b, a, N);
# else
  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
# endif
#else
  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
1569 
1570 // r[i] = c[i] - a[i] * b[i]
1571 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
1572 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1573 # if __aarch64__
1574  return vfmsq_f32(c, a, b);
1575 # else
1576  return vmlsq_f32(c, a, b);
1577 # endif
1578 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
1579  f128 ret;
1580  ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
1581  ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
1582  return ret;
1583 #else
1584  return F128::Sub(c, F128::Mult(a, b));
1585 #endif
1586 }
1587 
1588 // r[i] = c[i] - a * b[i]
1589 NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
1590 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1591 # if __aarch64__
1592  return vfmsq_n_f32(c, b, a);
1593 # else
1594  return vmlsq_n_f32(c, b, a);
1595 # endif
1596 #else
1597  return F128::MultSub(F128::SetValue(a, each_float), b, c);
1598 #endif
1599 }
1600 
1601 template <size_t N>
1602 // r[i] = c[i] - a[N] * b[i]
1603 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
1604  each_select32_tag) NLIB_NOEXCEPT {
1605  NLIB_STATIC_ASSERT(N < 4);
1606 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1607 # ifdef __aarch64__
1608  return vfmsq_laneq_f32(c, b, a, N);
1609 # else
1610  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1611 # endif
1612 #else
1613  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
1614 #endif
1615 }
1616 
1617 // r[i] = a[i] + t[i] * (b[i] - a[i])
1618 NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
1619  // a + t * (b - a)
1620  return F128::MultAdd(t, F128::Sub(b, a), a);
1621 }
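
// Editor's usage sketch (MidpointSketch is not part of nlib): t is a full
// vector, so each lane may blend with its own parameter; a plain midpoint
// splats the same 0.5f into every lane.
inline f128 MidpointSketch(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128::Lerp(a, b, F128::SetValue(0.5f, each_float));
}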
1622 
1623 // r[i] = a[i] & b[i]
1624 NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
1625 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1626  f128 ret;
1627  ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
1628  ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
1629  ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
1630  ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
1631  return ret;
1632 #elif defined(NLIB_SSE41)
1633  return _mm_and_ps(a, b);
1634 #elif defined(NLIB_NEON)
1635  uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
1636  return vreinterpretq_f32_u32(tmp);
1637 #endif
1638 }
1639 
1640 // -pi <= angle1, angle2, r < pi
1641 NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
1642  // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
1643  // -pi <= ret < pi
1644  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
1645  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);
1646 
1647  f128 sum = F128::Add(angle1, angle2);
1648  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
1649  f128 ofs = F128::And(cond, pi_dbl);
1650  f128 result = F128::Add(sum, ofs);
1651  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
1652  ofs = F128::And(cond, pi_dbl);
1653  return F128::Sub(result, ofs);
1654 }
1655 
1656 // -pi <= angle1, angle2, r < pi
1657 NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
1658  // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
1659  // -pi <= ret < pi
1660  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
1661  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);
1662 
1663  f128 sum = F128::Sub(angle1, angle2);
1664  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
1665  f128 ofs = F128::And(cond, pi_dbl);
1666  f128 result = F128::Add(sum, ofs);
1667  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
1668  ofs = F128::And(cond, pi_dbl);
1669  return F128::Sub(result, ofs);
1670 }
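
// Editor's usage sketch (AddAngleWrapSketch is not part of nlib): with inputs
// inside the documented ranges, a single 2*pi correction is enough; adding
// 3.0 and 1.5 radians wraps 4.5 to 4.5 - 2*pi = -1.7832 in every lane.
inline f128 AddAngleWrapSketch() NLIB_NOEXCEPT {
    f128 a = F128::SetValue(3.0f, each_float);
    f128 b = F128::SetValue(1.5f, each_float);
    return F128::AddAngle(a, b);
}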
1671 
1672 // ( 2 1 -2 1) (p0)
1673 // (t^3 t^2 t 1) (-3 -2 3 -1) (v0)
1674 // ( 0 1 0 0) (p1)
1675 // ( 1 0 0 0) (v1)
1676 NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
1677  f128arg_ex t) NLIB_NOEXCEPT {
1678  // (2 * p0 + v0 - 2 * p1 + v1) * t^3 + (-3 * p0 - 2 * v0 + 3 * p1 - v1) * t^2
1679  // + v0 * t + p0
1680  // ==
1681  // (2 * t^3 - 3 * t^2 + 1) * p0 + (t^3 - 2 * t^2 + t) * v0
1682  // + (-2 * t^3 + 3 * t^2) * p1 + (t^3 - t^2) * v1
1683  f128 tt = F128::Mult(t, t);
1684  f128 ttt = F128::Mult(tt, t);
1685 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1686  f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),
1687  vcreate_f32(0x3F800000C0000000ULL));
1688  f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),
1689  vcreate_f32(0xBF80000040400000ULL));
1690 #else
1691  f128 hermite_R0 = F128::LoadA16(hermite_R0_);
1692  f128 hermite_R1 = F128::LoadA16(hermite_R1_);
1693 #endif
1694 
1695  ttt = F128::Mult(ttt, hermite_R0);
1696  ttt = F128::MultAdd(tt, hermite_R1, ttt);
1697  ttt = F128::MultAdd(t, F128::Set0100(), ttt);
1698  ttt = F128::Add(ttt, F128::Set1000());
1699 
1700  // vec(ttt) * mtx(p0, v0, p1, v1)
1701  f128 result = F128::Mult<0>(ttt, p0, each_select32);
1702  result = F128::MultAdd<1>(ttt, v0, result, each_select32);
1703  result = F128::MultAdd<2>(ttt, p1, result, each_select32);
1704  result = F128::MultAdd<3>(ttt, v1, result, each_select32);
1705  return result;
1706 }
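
// Editor's usage sketch (EvalHermiteSketch is not part of nlib): the weight
// vector ttt is consumed through lane broadcasts, so t is expected to carry
// the same curve parameter in all four lanes.
inline f128 EvalHermiteSketch(f128arg p0, f128arg v0, f128arg p1, f128arg v1,
                              float t) NLIB_NOEXCEPT {
    return F128::Hermite(p0, v0, p1, v1, F128::SetValue(t, each_float));
}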
1707 
1708 // (-1 3 -3 1) (p0)
1709 // 0.5 * (t^3 t^2 t 1) ( 2 -5 4 -1) (p1)
1710 // (-1 0 1 0) (p2)
1711 // ( 0 2 0 0) (p3)
1712 NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
1713  f128arg_ex t) NLIB_NOEXCEPT {
1714  f128 tt = F128::Mult(t, t);
1715  f128 ttt = F128::Mult(tt, t);
1716 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1717  f128 catmull_R0 = vcombine_f32(vcreate_f32(0x3FC00000BF000000ULL),
1718  vcreate_f32(0x3F000000BFC00000ULL)); // {-0.5f, 1.5f, -1.5f, 0.5f}
1719  f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC02000003F800000ULL),
1720  vcreate_f32(0xBF00000040000000ULL)); // {1.f, -2.5f, 2.f, -0.5f}
1721  f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF000000ULL),
1722  vcreate_f32(0x000000003F000000ULL)); // {-0.5f, 0.f, 0.5f, 0.f}
1723 #else
1724  f128 catmull_R0 = F128::LoadA16(catmull_R0_);
1725  f128 catmull_R1 = F128::LoadA16(catmull_R1_);
1726  f128 catmull_R2 = F128::LoadA16(catmull_R2_);
1727 #endif
1728  ttt = F128::Mult(ttt, catmull_R0);
1729  ttt = F128::MultAdd(tt, catmull_R1, ttt);
1730  ttt = F128::MultAdd(t, catmull_R2, ttt);
1731  ttt = F128::Add(ttt, F128::Set0100());
1732 
1733  // vec(ttt) * mtx(p0, p1, p2, p3)
1734  f128 result = F128::Mult<0>(ttt, p0, each_select32);
1735  result = F128::MultAdd<1>(ttt, p1, result, each_select32);
1736  result = F128::MultAdd<2>(ttt, p2, result, each_select32);
1737  result = F128::MultAdd<3>(ttt, p3, result, each_select32);
1738 
1739  return result;
1740 }
1741 
1742 // p0 + f * (p1 - p0) + g * (p2 - p0)
1743 NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
1744  f128arg_ex g) NLIB_NOEXCEPT {
1745  f128 p1p0 = F128::Sub(p1, p0);
1746  f128 p2p0 = F128::Sub(p2, p0);
1747  f128 tmp = F128::MultAdd(f, p1p0, p0);
1748  return F128::MultAdd(g, p2p0, tmp);
1749 }
1750 
1751 // r[i] = a[i] | b[i]
1752 NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
1753 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1754  f128 ret;
1755  ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
1756  ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
1757  ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
1758  ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
1759  return ret;
1760 #elif defined(NLIB_SSE41)
1761  return _mm_or_ps(a, b);
1762 #elif defined(NLIB_NEON)
1763  uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
1764  return vreinterpretq_f32_u32(tmp);
1765 #endif
1766 }
1767 
1768 // r[i] = a[i] ^ b[i]
1769 NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
1770 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1771  f128 ret;
1772  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
1773  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
1774  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
1775  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
1776  return ret;
1777 #elif defined(NLIB_SSE41)
1778  return _mm_xor_ps(a, b);
1779 #elif defined(NLIB_NEON)
1780  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
1781  return vreinterpretq_f32_u32(tmp);
1782 #endif
1783 }
1784 
1785 // r[i] = ~a[i]
1786 NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
1787 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1788  f128 ret;
1789  ret.vec.u[0] = ~a.vec.u[0];
1790  ret.vec.u[1] = ~a.vec.u[1];
1791  ret.vec.u[2] = ~a.vec.u[2];
1792  ret.vec.u[3] = ~a.vec.u[3];
1793  return ret;
1794 #elif defined(NLIB_SSE41)
1795  return _mm_andnot_ps(a, F128::CmpEq(a, a));
1796 #elif defined(NLIB_NEON)
1797  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
1798  return vreinterpretq_f32_u32(tmp);
1799 #endif
1800 }
1801 
1802 // r[i] = ~a[i] & b[i]
1803 NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1804 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1805  f128 ret;
1806  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
1807  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
1808  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
1809  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
1810  return ret;
1811 #elif defined(NLIB_SSE41)
1812  return _mm_andnot_ps(a, b);
1813 #elif defined(NLIB_NEON)
1814  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1815  return vreinterpretq_f32_u32(tmp);
1816 #endif
1817 }
1818 
1819 // r[i] = ~a[i] | b[i]
1820 NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1821 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1822  f128 ret;
1823  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
1824  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
1825  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
1826  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
1827  return ret;
1828 #elif defined(NLIB_SSE41)
1829  return _mm_or_ps(F128::Not(a), b);
1830 #elif defined(NLIB_NEON)
1831  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1832  return vreinterpretq_f32_u32(tmp);
1833 #endif
1834 }
1835 
1836 // r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
1837 NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
1838 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1839  f128 ret;
1840  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
1841  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
1842  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
1843  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
1844  return ret;
1845 #elif defined(NLIB_SSE41)
1846  return _mm_cmpeq_ps(a, b);
1847 #elif defined(NLIB_NEON)
1848  uint32x4_t tmp = vceqq_f32(a, b);
1849  return vreinterpretq_f32_u32(tmp);
1850 #endif
1851 }
1852 
1853 // r[i] = (absf(a[i] - b[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1854 NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
1855  f128 tmp = F128::AbsDiff(a, b);
1856  return F128::CmpLe(tmp, eps);
1857 }
1858 
1859 // r[i] = clamp(value[i], min[i], max[i])
1860 NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
1861  return F128::Min(max, F128::Max(min, value));
1862 }
1863 
1864 // r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
1865 NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
1866 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1867  uint32x4_t tmp = vcaleq_f32(value, bounds);
1868  return vreinterpretq_f32_u32(tmp);
1869 #else
1870  return F128::CmpLe(F128::Abs(value), bounds);
1871 #endif
1872 }
1873 
1874 NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
1875 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1876  return vreinterpretq_f32_u32(vceqzq_f32(value));
1877 #else
1878  return F128::CmpEq(value, F128::SetZero());
1879 #endif
1880 }
1881 
1882 NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
1883 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1884  return vreinterpretq_f32_u32(vcltzq_f32(value));
1885 #else
1886  return F128::CmpLt(value, F128::SetZero());
1887 #endif
1888 }
1889 
1890 NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
1891 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1892  return vreinterpretq_f32_u32(vclezq_f32(value));
1893 #else
1894  return F128::CmpLe(value, F128::SetZero());
1895 #endif
1896 }
1897 
1898 NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
1899 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1900  return vreinterpretq_f32_u32(vcgtzq_f32(value));
1901 #else
1902  return F128::CmpGt(value, F128::SetZero());
1903 #endif
1904 }
1905 
1906 NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
1907 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1908  return vreinterpretq_f32_u32(vcgezq_f32(value));
1909 #else
1910  return F128::CmpGe(value, F128::SetZero());
1911 #endif
1912 }
1913 
1914 NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
1915 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1916  return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
1917 #else
1918  return F128::CmpNe(value, F128::SetZero());
1919 #endif
1920 }
1921 
1922 // r[i] = (absf(value[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1923 NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
1924  f128 tmp = F128::Abs(value);
1925  return F128::CmpLe(tmp, eps);
1926 }
1927 
1928 // r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
1929 NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
1930 #ifdef NLIB_F128_SIMD_NOUSE
1931  f128 ret;
1932  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
1933  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
1934  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
1935  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
1936  return ret;
1937 #elif defined(NLIB_SSE41)
1938  return _mm_div_ps(F128::SetOne(), value);
1939 #elif defined(NLIB_NEON)
1940 #ifdef __aarch64__
1941  return vdivq_f32(vdupq_n_f32(1.f), value);
1942 #else
1943  float32x4_t x;
1944  x = vrecpeq_f32(value);
1945  x = vmulq_f32(x, vrecpsq_f32(x, value)); // x1 = x0 * (2 - x0 * value)
1946  x = vmulq_f32(x, vrecpsq_f32(x, value)); // x2 = x1 * (2 - x1 * value)
1947  uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
1948  float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
1949  return result;
1950 #endif
1951 #elif defined(CAFE)
1952  return F128::Div(F128::SetOne(), value);
1953 #endif
1954 }
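
// Editor's scalar sketch of the NEON refinement above (RefineRecpSketch is not
// part of nlib): each vrecpsq_f32 step is one Newton-Raphson iteration for
// f(x) = 1/x - v, roughly doubling the accurate bits; x0 stands in for the
// rough hardware estimate that vrecpeq_f32 produces.
inline float RefineRecpSketch(float v, float x0) {
    float x = x0 * (2.0f - v * x0); // x1 = x0 * (2 - x0 * v)
    return x * (2.0f - v * x);      // x2 = x1 * (2 - x1 * v)
}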
1955 
1956 // r[i] = 1.f / value[i] with lower precision
1957 NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
1958 #ifdef NLIB_F128_SIMD_NOUSE
1959  f128 ret;
1960  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
1961  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
1962  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
1963  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
1964  return ret;
1965 #elif defined(NLIB_SSE41)
1966  return _mm_rcp_ps(value);
1967 #elif defined(NLIB_NEON)
1968  return vrecpeq_f32(value);
1969 #elif defined(CAFE)
1970  f128 ret;
1971  ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
1972  ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
1973  return ret;
1974 #endif
1975 }
1976 
1977 // r[i] = sqrtf(value[i]) with higher precision
1978 NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
1979 #ifdef NLIB_F128_SIMD_NOUSE
1980  f128 ret;
1981  ret.vec.v[0] = sqrtf(value.vec.v[0]);
1982  ret.vec.v[1] = sqrtf(value.vec.v[1]);
1983  ret.vec.v[2] = sqrtf(value.vec.v[2]);
1984  ret.vec.v[3] = sqrtf(value.vec.v[3]);
1985  return ret;
1986 #elif defined(NLIB_SSE41)
1987  return _mm_sqrt_ps(value);
1988 #elif defined(NLIB_NEON)
1989  f128 iszero = F128::CmpEqZero(value);
1990  f128 result = F128::Mult(value, F128::RecpSqrt(value));
1991  return F128::AndNot(iszero, result);
1992 #elif defined(CAFE)
1993  f128 zero = F128::SetZero();
1994  f128 iszero = F128::CmpEq(zero, value);
1995  f128 result = F128::Mult(value, F128::RecpSqrt(value));
1996  return F128::Select(iszero, zero, result);
1997 #endif
1998 }
1999 
2000 // r[i] = sqrtf(value[i]) with lower precision
2001 NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
2002 #ifdef NLIB_F128_SIMD_NOUSE
2003  f128 ret;
2004  ret.vec.v[0] = sqrtf(value.vec.v[0]);
2005  ret.vec.v[1] = sqrtf(value.vec.v[1]);
2006  ret.vec.v[2] = sqrtf(value.vec.v[2]);
2007  ret.vec.v[3] = sqrtf(value.vec.v[3]);
2008  return ret;
2009 #elif defined(NLIB_SSE41)
2010  return _mm_sqrt_ps(value);
2011 #elif defined(NLIB_NEON)
2012  return vrecpeq_f32(vrsqrteq_f32(value));
2013 #elif defined(CAFE)
2014  f128 ret;
2015  ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
2016  ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
2017  return ret;
2018 #endif
2019 }
2020 
2021 // r[i] = sqrtf(1.f / value[i]) with higher precision
2022 NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
2023 #ifdef NLIB_F128_SIMD_NOUSE
2024  f128 ret;
2025  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
2026  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
2027  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
2028  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
2029  return ret;
2030 #elif defined(NLIB_SSE41)
2031  return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
2032 #elif defined(NLIB_NEON)
2033  float32x4_t x;
2034  x = vrsqrteq_f32(value);
2035  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
2036  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
2037  f128 zeromask = F128::CmpEqZero(value);
2038  return F128::Select(zeromask, F128::SetInfinity(), x);
2039 #elif defined(CAFE)
2040  f32x2 three = __PS_FDUP(3.f);
2041  f32x2 half = __PS_FDUP(0.5f);
2042  f32x2 x;
2043  f32x2 xx;
2044  f32x2 v;
2045  f128 ret;
2046 
2047  v = value.vec.ps[0];
2048  x = __PS_RSQRTE(v);
2049 
2050  xx = __PS_MUL(x, x);
2051  xx = __PS_NMSUB(v, xx, three);
2052  xx = __PS_MUL(x, xx);
2053  x = __PS_MUL(half, xx);
2054 
2055  xx = __PS_MUL(x, x);
2056  xx = __PS_NMSUB(v, xx, three);
2057  xx = __PS_MUL(x, xx);
2058  ret.vec.ps[0] = __PS_MUL(half, xx);
2059 
2060  v = value.vec.ps[1];
2061  x = __PS_RSQRTE(v);
2062 
2063  xx = __PS_MUL(x, x);
2064  xx = __PS_NMSUB(v, xx, three);
2065  xx = __PS_MUL(x, xx);
2066  x = __PS_MUL(half, xx);
2067 
2068  xx = __PS_MUL(x, x);
2069  xx = __PS_NMSUB(v, xx, three);
2070  xx = __PS_MUL(x, xx);
2071  ret.vec.ps[1] = __PS_MUL(half, xx);
2072 
2073  f128 iszero = F128::CmpEq(F128::SetZero(), value);
2074  f128 inf = F128::SetInfinity();
2075  return F128::Select(iszero, inf, ret);
2076 #endif
2077 }
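
// Editor's scalar sketch of the iteration coded above with __PS_RSQRTE and
// __PS_NMSUB (RefineRecpSqrtSketch is not part of nlib): Newton-Raphson for
// f(x) = 1/x^2 - v, i.e. x' = 0.5 * x * (3 - v * x * x), applied twice.
inline float RefineRecpSqrtSketch(float v, float x0) {
    float x = 0.5f * x0 * (3.0f - v * x0 * x0); // first step
    return 0.5f * x * (3.0f - v * x * x);       // second step
}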
2078 
2079 // r[i] = sqrtf(1.f / value[i]) with lower precision
2080 NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
2081 #ifdef NLIB_F128_SIMD_NOUSE
2082  f128 ret;
2083  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
2084  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
2085  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
2086  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
2087  return ret;
2088 #elif defined(NLIB_SSE41)
2089  return _mm_rsqrt_ps(value);
2090 #elif defined(NLIB_NEON)
2091  return vrsqrteq_f32(value);
2092 #elif defined(CAFE)
2093  f128 ret;
2094  ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
2095  ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
2096  return ret;
2097 #endif
2098 }
2099 
2100 template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
2101 NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
2102  const size_t lane0 = NegateLane0 ? 4 : 0;
2103  const size_t lane1 = NegateLane1 ? 5 : 1;
2104  const size_t lane2 = NegateLane2 ? 6 : 2;
2105  const size_t lane3 = NegateLane3 ? 7 : 3;
2106  return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
2107 }
2108 
2109 template <>
2110 NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
2111  return value;
2112 }
2113 
2114 template <>
2115 NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
2116  return F128::Negate(value);
2117 }
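
// Editor's usage sketch (QuatConjugateSketch is not part of nlib): per-lane
// negation expresses fixed sign-flip patterns, e.g. conjugating a quaternion
// stored as {x, y, z, w}.
inline f128 QuatConjugateSketch(f128arg q) NLIB_NOEXCEPT {
    return F128::NegateEx<true, true, true, false>(q);
}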
2118 
2119 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2120 #define NLIB_ISNAN(vec, idx) \
2121  ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
2122 #define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
2123 #endif
2124 
2125 // r[i] = isnan(value[i]) ? 0xFFFFFFFF : 0
2126 NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
2127 #if defined(NLIB_F128_SIMD_NOUSE)
2128  f128 ret;
2129  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
2130  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
2131  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
2132  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
2133  return ret;
2134 #elif defined(CAFE)
2135  // on CAFE, value is NaN if neither value >= 0 nor -value >= 0 holds
2136  f32x2 one = __PS_FDUP(1.f);
2137  f32x2 minus_one = __PS_NEG(one);
2138  f32x2 v0 = value.vec.ps[0];
2139  f32x2 v1 = value.vec.ps[1];
2140  f32x2 t0 = __PS_SEL(v0, one, minus_one);
2141  f32x2 t1 = __PS_SEL(v1, one, minus_one);
2142  f128 ret;
2143  f32x2 v0neg = __PS_NEG(v0);
2144  f32x2 v1neg = __PS_NEG(v1);
2145  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
2146  ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);
2147  return ret;
2148 #else
2149  return F128::CmpNe(value, value);
2150 #endif
2151 }
2152 
2153 // r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
2154 NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
2155 #if defined(NLIB_F128_SIMD_NOUSE)
2156  f128 ret;
2157  ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
2158  ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
2159  ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
2160  ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
2161  return ret;
2162 #elif defined(CAFE)
2163  f128 ret;
2164  f32x2 big_value = __PS_FDUP(FLT_MAX);
2165  ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
2166  ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
2167  return ret;
2168 #else
2169  f128 inf_value = F128::SetInfinity();
2170  f128 abs_value = F128::Abs(value);
2171  return F128::CmpEq(inf_value, abs_value);
2172 #endif
2173 }
2174 
2175 // for example, 6.54321 -> 7.0, -6.54321 -> -7.0
2176 NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
2177 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
2178  return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2179 #elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
2180  return vrndaq_f32(value);
2181 #else
2182  // Adding and then subtracting 2^23 (carrying value's sign) rounds away the fraction
2183  f128 sgn = F128::And(value, F128::SetSignMask());
2184  f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
2185  f128 result = F128::Sub(F128::Add(value, sm), sm);
2186  return result;
2187 #endif
2188 }
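
// Editor's scalar sketch of the fallback above (RoundByMagicSketch is not part
// of nlib): adding and subtracting 2^23 with value's sign makes the FPU drop
// the fraction under its current rounding mode; only valid for |v| < 2^23.
inline float RoundByMagicSketch(float v) {
    float magic = (v < 0.0f) ? -8388608.0f : 8388608.0f; // copysign(2^23, v)
    return (v + magic) - magic;
}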
2189 
2190 // for example, 6.54321 -> 6.0, -6.54321 -> -6.0
2191 NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
2192 // Note that there is no fraction if |value| > 2^23
2193 // 2^23 = 8388608
2194 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2195  f128 ret;
2196  for (size_t i = 0; i < 4; ++i) {
2197  if (NLIB_ISNAN(value.vec, i)) {
2198  ret.vec.u[i] = 0x7FC00000U;
2199  } else {
2200  ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
2201  ? static_cast<float>(static_cast<int>(value.vec.v[i]))
2202  : value.vec.v[i];
2203  }
2204  }
2205  return ret;
2206 #elif defined(NLIB_SSE41)
2207  return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2208 #elif defined(NLIB_NEON)
2209 # if __ARM_ARCH < 8
2210  f128 x = F128::Abs(value);
2211  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
2212  f128 cond = F128::CmpLt(x, c_2_23);
2213  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
2214  return F128::Select(cond, casted, value);
2215 # else
2216  return vrndq_f32(value);
2217 # endif
2218 #endif
2219 }
2220 
2221 // for example, 6.54321 -> 6.0, -6.54321 -> -7.0
2222 NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
2223 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2224  f128 ret;
2225  ret.vec.v[0] = floorf(value.vec.v[0]);
2226  ret.vec.v[1] = floorf(value.vec.v[1]);
2227  ret.vec.v[2] = floorf(value.vec.v[2]);
2228  ret.vec.v[3] = floorf(value.vec.v[3]);
2229  return ret;
2230 #elif defined(NLIB_SSE41)
2231  return _mm_floor_ps(value);
2232 #elif defined(NLIB_NEON)
2233 # if __ARM_ARCH < 8
2234  // Note that there is no fraction if |value| > 2^23
2235  // 2^23 = 8388608
2236  f128 x = F128::Abs(value);
2237  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
2238  f128 cond = F128::CmpLt(x, c_2_23);
2239  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
2240 
2241  // -1 if result is larger
2242  f128 large_mask = F128::CmpGt(casted, value);
2243  // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
2244  casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(large_mask)));
2245  return F128::Select(cond, casted, value);
2246 # else
2247  return vrndmq_f32(value);
2248 # endif
2249 #endif
2250 }
2251 
2252 // for example, 6.54321 -> 7.0, -6.54321 -> -6.0
2253 NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
2254 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2255  f128 ret;
2256  ret.vec.v[0] = ceilf(value.vec.v[0]);
2257  ret.vec.v[1] = ceilf(value.vec.v[1]);
2258  ret.vec.v[2] = ceilf(value.vec.v[2]);
2259  ret.vec.v[3] = ceilf(value.vec.v[3]);
2260  return ret;
2261 #elif defined(NLIB_SSE41)
2262  return _mm_ceil_ps(value);
2263 #elif defined(NLIB_NEON)
2264 # if __ARM_ARCH < 8
2265  // Note that there is no fraction if |value| > 2^23
2266  // 2^23 = 8388608
2267  f128 x = F128::Abs(value);
2268  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
2269  f128 cond = F128::CmpLt(x, c_2_23);
2270  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
2271 
2272  // +1 if result is smaller
2273  f128 small_mask = F128::CmpLt(casted, value);
2274  // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
2275  casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(small_mask)));
2276  return F128::Select(cond, casted, value);
2277 # else
2278  return vrndpq_f32(value);
2279 # endif
2280 #endif
2281 }
2282 
2283 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2284 #undef NLIB_ISNAN
2285 #undef NLIB_ISINF
2286 #endif
2287 
2288 // r[i] = clamp(value[i], { 0, 0, 0, 0 }, { 1, 1, 1, 1 })
2289 NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
2290  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
2291 }
2292 
2293 NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
2294  static const float v_1_2pi = 0.15915494309189535f;
2295  static const float v_2pi = 6.283185307179586f;
2296  // value - 2pi * round(value * (1/2pi)) to be [-pi, pi)
2297  const f128 recp_two_pi = F128::SetValue(v_1_2pi, each_float);
2298  f128 round = F128::Round(F128::Mult(value, recp_two_pi));
2299  const f128 two_pi = F128::SetValue(v_2pi, each_float);
2300  return F128::MultSub(two_pi, round, value);
2301 }
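
// Editor's scalar restatement (ModAngleScalarSketch is not part of nlib):
// subtract the nearest multiple of 2*pi; roundf stands in for F128::Round.
inline float ModAngleScalarSketch(float v) {
    return v - 6.283185307179586f * roundf(v * 0.15915494309189535f);
}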
2302 
2303 NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
2304  // within [-pi, pi)
2305  f128 x = F128::ModAngle(value);
2306 
2307  // within [-pi/2, pi/2]
2308  // use sin(x) == sin(pi - x), sin(x) == sin(-pi - x)
2309  // |x| <= pi/2 -> x = x
2310  // x > pi/2 -> x = pi - x
2311  // x < -pi/2 -> x = -pi - x
2312  f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
2313  f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
2314  f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);
2315 
2316  f128 xabs = F128::Abs(x); // classify the reduced angle x, not the raw input
2317  f128 xsign = F128::And(F128::SetSignMask(), x);
2318  f128 mypi = F128::Or(xsign, pi);
2319  f128 pi_x = F128::Sub(mypi, x);
2320  f128 cond = F128::CmpLe(xabs, pidiv2);
2321  x = F128::Select(cond, x, pi_x);
2322 
2323  f128 xx = F128::Mult(x, x);
2324  f128 coeff = F128::LoadA16(sin_coeff_);
2325  f128 result;
2326  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2327 
2328  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2329  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2330  result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
2331  result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
2332  result = F128::Mult(xx, result);
2333  result = F128::MultSub(result, x, x);
2334  return result;
2335 }
2336 
2337 NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
2338  // within [-pi, pi)
2339  f128 x = F128::ModAngle(value);
2340 
2341  // within [-pi/2, pi/2]
2342  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2343  // |x| <= pi/2 -> x = x
2344  // x > pi/2 -> x = pi - x
2345  // x < -pi/2 -> x = -pi - x
2346  f128 cvalue = F128::LoadA16(cos_cvalue_);
2347 
2348  f128 xabs = F128::Abs(x); // classify the reduced angle x, not the raw input
2349  f128 xsign = F128::And(F128::SetSignMask(), x);
2350  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2351  f128 pi_x = F128::Sub(mypi, x);
2352  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2353  x = F128::Select(cond, x, pi_x);
2354 
2355  // +1 if [-pi/2, pi/2], -1 otherwise
2356  f128 sign = F128::AndNot(cond, F128::SetSignMask());
2357 
2358  // xx = x^2
2359  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2360  f128 xx = F128::Mult(x, x);
2361  f128 coeff = F128::LoadA16(cos_coeff_);
2362  f128 result;
2363  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2364 
2365  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2366  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2367  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2368  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2369  result = F128::MultSub(xx, result, F128::SetOne());
2370  result = F128::Xor(sign, result);
2371  return result;
2372 }
2373 
2374 NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
2375  // within [-pi, pi)
2376  const f128 signmask = F128::SetSignMask();
2377  f128 x = F128::ModAngle(value);
2378 
2379  // within [-pi/2, pi/2]
2380  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2381  // |x| <= pi/2 -> x = x
2382  // x > pi/2 -> x = pi - x
2383  // x < -pi/2 -> x = -pi - x
2384  f128 cvalue = F128::LoadA16(cos_cvalue_);
2385 
2386  f128 xabs = F128::Abs(x); // classify the reduced angle x, not the raw input
2387  f128 xsign = F128::And(signmask, x);
2388  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2389  f128 pi_x = F128::Sub(mypi, x);
2390  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2391  x = F128::Select(cond, x, pi_x);
2392 
2393  // +1 if [-pi/2, pi/2], -1 otherwise
2394  f128 sign = F128::AndNot(cond, signmask);
2395 
2396  // xx = x^2
2397  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2398  f128 xx = F128::Mult(x, x);
2399  f128x2 ret;
2400 
2401  // cos
2402  {
2403  f128 coeff = F128::LoadA16(cos_coeff_);
2404  f128 result;
2405  result =
2406  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2407 
2408  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2409  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2410  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2411  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2412  result = F128::MultSub(xx, result, F128::SetOne());
2413 
2414  ret.val[1] = F128::Xor(sign, result); // cos
2415  }
2416 
2417  // sin
2418  {
2419  f128 coeff = F128::LoadA16(sin_coeff_);
2420  f128 result;
2421  result =
2422  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2423 
2424  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2425  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2426  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
2427  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
2428  result = F128::Mult(xx, result);
2429  ret.val[0] = F128::MultSub(result, x, x); // sin
2430  }
2431  return ret;
2432 }
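
// Editor's usage sketch (SinCosUsageSketch is not part of nlib): SinCos shares
// one range reduction between both polynomials; val[0] carries the sines and
// val[1] the cosines.
inline void SinCosUsageSketch(f128arg angles, f128* s, f128* c) NLIB_NOEXCEPT {
    f128x2 sc = F128::SinCos(angles);
    *s = sc.val[0];
    *c = sc.val[1];
}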
2433 
2434 NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
2435  // |value| <= 1 -> atan(value)
2436  // value > 1 -> pi/2 - atan(1/value)
2437  // value < -1 -> -pi/2 - atan(1/value)
2438  f128 cmp, value_sign;
2439  {
2440  f128 one = F128::SetOne();
2441 
2442  // value_sign:
2443  // 1 if value > 1,
2444  // -1 if value < -1
2445  value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
2446  cmp = F128::CmpLe(F128::Abs(value), one);
2447  }
2448  f128 x = F128::Select(cmp, value, F128::Recp(value));
2449 
2450  // atan(x) = x - 1/3 * x^3 + ... + (-1)^n/(2n+1) * x^(2n+1)
2451  // = x(1 - xx(1/3 - xx(1/5 - xx(1/7 - xx(1/9 - xx(1/11 - xx(1/13 - xx(1/15 - xx(1/17)...)
2452  // NOTE:
2453  // DO NOT USE TAYLOR SERIES
2454  // minimax approximation (the output of the Remez exchange algorithm)
2455  f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
2456  f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
2457  f128 xx = F128::Mult(x, x);
2458  f128 result;
2459  result = F128::MultSub<3>(coeff1, xx, F128::SetValue<2>(coeff1, each_select32), each_select32);
2460  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
2461  result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
2462  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
2463  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
2464  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
2465  result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
2466 
2467  result = F128::Mult(result, x);
2468  result = F128::MultSub(xx, result, x);
2469 
2470  f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
2471  f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
2472  result = F128::Select(cmp, result, result_another);
2473  return result;
2474 }
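
// Editor's scalar restatement of the reduction above (ArcTanReduceSketch is
// not part of nlib): for |v| > 1 the polynomial argument becomes 1/v and
// atan(v) = copysign(pi/2, v) - atan(1/v) restores the result.
inline float ArcTanReduceSketch(float v) {
    if (fabsf(v) <= 1.0f) return atanf(v);
    return copysignf(1.5707963267948966f, v) - atanf(1.0f / v);
}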
2475 
2476 NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
2477  // y / x -> value
2478  // 0 / - -> pi | sign(y)
2479  // 0 / + -> 0 | sign(y)
2480  // y / Inf -> 0 | sign(y)
2481  // y / -Inf -> pi | sign(y)
2482  // y!=0 / 0 -> pi/2 | sign(y)
2483  // y!=0 / - -> atan(y/x) + pi | sign(y)
2484  // Inf / x -> pi/2 | sign(y)
2485  // +-Inf / Inf -> pi/4 | sign(y)
2486  // +-Inf / -Inf -> 3pi/4 | sign(y)
2487  // otherwise -> atan(y/x)
2488 
2489  // sx = sign(x), sy = sign(y)
2490  // infx = isinf(x), infy = isinf(y)
2491  // zerox = iszero(x), zeroy = iszero(y)
2492  // posx = x > 0
2493  const f128 signmask = F128::SetSignMask();
2494  // const f128 sx = F128::And(x, signmask);
2495  const f128 sy = F128::And(y, signmask);
2496  const f128 infx = F128::IsInfinite(x);
2497  const f128 infy = F128::IsInfinite(y);
2498  const f128 zerox = F128::CmpEqZero(x);
2499  const f128 zeroy = F128::CmpEqZero(y);
2500  const f128 posx = F128::CmpGtZero(x);
2501 
2502  // v =
2503  // infy ?
2504  // infx ?
2505  // posx ? (pi/4 | sy) : (3pi/4 | sy)
2506  // : (pi/2 | sy)
2507  // : zeroy ?
2508  // posx ? (0 | sy) : (pi | sy)
2509  // : zerox ? (pi/2 | sy) : TrueMask;
2510  const f128 cval = F128::LoadA16(atan2_cvalue_);
2511  const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
2512  const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
2513  const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
2514  const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));
2515 
2516  f128 v = F128::Select(
2517  infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
2518  F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));
2519 
2520 // mask = EqInt(v, full);
2521 // result = Atan(y/x) + (posx ? (0 | sy) : (pi | sy))
2522 // return mask ? result : v;
2523 #if defined(NLIB_F128_SIMD_NOUSE)
2524  f128 mask;
2525  mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
2526  mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
2527  mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
2528  mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
2529 #elif defined(CAFE)
2530  // select makes 0xFFFFFFFFUL -> 0xFF7FFFFFUL
2531  f128 mask;
2532  mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
2533  mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
2534  mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
2535  mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
2536 #else
2537  f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v),
2538  I128::SetValue(-1, each_int8)));
2539 #endif
2540  f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
2541  return F128::Select(mask, result, v);
2542 }
2543 
2544 NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
2545  // asin(x) = atan2 (x, sqrt ((1.0 + x) * (1.0 - x)))
2546  f128 one = F128::SetOne();
2547  f128 tmp = F128::MultSub(value, value, one);
2548  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2549  return F128::ArcTan2(value, argx);
2550 }
2551 
2552 NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
2553  // acos(x) = atan2 (sqrt ((1.0 + x) * (1.0 - x)), x)
2554  f128 one = F128::SetOne();
2555  f128 tmp = F128::MultSub(value, value, one);
2556  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2557  return F128::ArcTan2(argx, value);
2558 }
2559 
2560 // see _mm_movemask_ps of SSE
2561 NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT { // NOLINT
2562 #ifdef NLIB_F128_SIMD_NOUSE
2563  uint8_t ret = 0;
2564  ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
2565  ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
2566  ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
2567  ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
2568  return ret;
2569 #elif defined(NLIB_SSE41)
2570  return static_cast<uint8_t>(_mm_movemask_ps(value));
2571 #elif defined(NLIB_NEON)
2572  uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
2573  uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
2574  uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
2575  uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
2576 # ifdef __aarch64__
2577  return vaddvq_u32(a);
2578 # else
2579  uint16x4_t tmp = vmovn_u32(a);
2580  tmp = vpadd_u16(tmp, tmp);
2581  tmp = vpadd_u16(tmp, tmp);
2582  return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
2583 # endif
2584 #elif defined(CAFE)
2585  int tmp = (value.vec.u[0] >> 31);
2586  tmp |= (value.vec.u[1] >> 30) & 2;
2587  tmp |= (value.vec.u[2] >> 29) & 4;
2588  tmp |= (value.vec.u[3] >> 28) & 8;
2589  return tmp;
2590 #endif
2591 }
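
// Editor's usage sketch (AnyLaneGreaterSketch is not part of nlib): comparison
// results collapse into a 4-bit lane mask, so "does any lane satisfy a > b"
// becomes a single integer test.
inline bool AnyLaneGreaterSketch(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128::MoveMask(F128::CmpGt(a, b)) != 0;
}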
2592 
2593 // true if value[i] == 0 for all i
2594 NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT { // NOLINT
2595 #ifdef NLIB_F128_SIMD_NOUSE
2596  return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
2597 #elif defined(NLIB_SSE41)
2598  i128 casted = F128::CastToI128(value);
2599  return _mm_testz_si128(casted, casted) != 0;
2600 #elif defined(NLIB_NEON)
2601 # ifdef __aarch64__
2602  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
2603  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2604 # else
2605  int32x4_t casted = vreinterpretq_s32_f32(value);
2606  int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
2607  return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
2608 # endif
2609 #elif defined(CAFE)
2610  uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
2611  return (tmp & 0x80000000U) == 0;
2612 #endif
2613 }
2614 
2615 // true if value[i] == 0xFFFFFFFF for all i
2616 NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT { // NOLINT
2617 #ifdef NLIB_F128_SIMD_NOUSE
2618  return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
2619  value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
2620 #elif defined(NLIB_SSE41)
2621  i128 casted = F128::CastToI128(value);
2622  return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
2623 #elif defined(NLIB_NEON)
2624 # ifdef __aarch64__
2625  uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
2626  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2627 # else
2628  int32x4_t casted = vreinterpretq_s32_f32(value);
2629  int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
2630  return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
2631 # endif
2632 #elif defined(CAFE)
2633  uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
2634  return (tmp & 0x80000000U) != 0;
2635 #endif
2636 }
2637 
2638 template <size_t N>
2639 // r = value[N]
2640 NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
2641  NLIB_STATIC_ASSERT(N < 4);
2642 #ifdef NLIB_F128_SIMD_NOUSE
2643  return value.vec.v[N];
2644 #elif defined(NLIB_SSE41)
2645  float dest;
2646  _MM_EXTRACT_FLOAT(dest, value, N);
2647  return dest;
2648 #elif defined(NLIB_NEON)
2649  return vgetq_lane_f32(value, N);
2650 #elif defined(CAFE)
2651  return value.vec.ps[N / 2][N % 2];
2652 #endif
2653 }
2654 
2655 template <size_t N>
2656 // r = *reinterpret_cast<uint32_t*>(&value[N])
2657 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
2658  NLIB_STATIC_ASSERT(N < 4);
2659 #ifdef NLIB_F128_SIMD_NOUSE
2660  return value.vec.u[N];
2661 #elif defined(NLIB_SSE41)
2662  return _mm_extract_ps(value, N);
2663 #elif defined(NLIB_NEON)
2664  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2665  return vgetq_lane_u32(tmp, N);
2666 #elif defined(CAFE)
2667  return value.vec.u[N];
2668 #endif
2669 }
2670 
2671 // r = value[idx]
2672 NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT { // NOLINT
2673 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2674  return value.vec.v[idx];
2675 #elif defined(NLIB_SSE41)
2676  float dest;
2677  switch (idx) {
2678  case 0:
2679  _MM_EXTRACT_FLOAT(dest, value, 0);
2680  break;
2681  case 1:
2682  _MM_EXTRACT_FLOAT(dest, value, 1);
2683  break;
2684  case 2:
2685  _MM_EXTRACT_FLOAT(dest, value, 2);
2686  break;
2687  case 3:
2688  _MM_EXTRACT_FLOAT(dest, value, 3);
2689  break;
2690  default:
2691  NLIB_ASSUME(0);
2692  break;
2693  }
2694  return dest;
2695 #elif defined(NLIB_NEON)
2696  switch (idx) {
2697  case 0:
2698  return vgetq_lane_f32(value, 0);
2699  case 1:
2700  return vgetq_lane_f32(value, 1);
2701  case 2:
2702  return vgetq_lane_f32(value, 2);
2703  case 3:
2704  return vgetq_lane_f32(value, 3);
2705  default:
2706  NLIB_ASSUME(0);
2707  break;
2708  }
2709 #endif
2710 }
2711 
2712 // r = *reinterpret_cast<uint32_t*>(&value[idx])
2713 NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2714 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2715  return value.vec.u[idx];
2716 #elif defined(NLIB_SSE41)
2717  switch (idx) {
2718  case 0:
2719  return static_cast<uint32_t>(_mm_extract_ps(value, 0));
2720  case 1:
2721  return static_cast<uint32_t>(_mm_extract_ps(value, 1));
2722  case 2:
2723  return static_cast<uint32_t>(_mm_extract_ps(value, 2));
2724  case 3:
2725  return static_cast<uint32_t>(_mm_extract_ps(value, 3));
2726  default:
2727  NLIB_ASSUME(0);
2728  break;
2729  }
2730 #elif defined(NLIB_NEON)
2731  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2732  switch (idx) {
2733  case 0:
2734  return vgetq_lane_u32(tmp, 0);
2735  case 1:
2736  return vgetq_lane_u32(tmp, 1);
2737  case 2:
2738  return vgetq_lane_u32(tmp, 2);
2739  case 3:
2740  return vgetq_lane_u32(tmp, 3);
2741  default:
2742  NLIB_ASSUME(0);
2743  break;
2744  }
2745 #endif
2746 }
2747 
2748 template <size_t N>
2749 // r = value, r[N] = v
2750 NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT { // NOLINT
2751  NLIB_STATIC_ASSERT(N < 4);
2752 #ifdef NLIB_F128_SIMD_NOUSE
2753  f128 ret = value;
2754  ret.vec.v[N] = v;
2755  return ret;
2756 #elif defined(NLIB_SSE41)
2757  f128 tmp = _mm_set_ss(v);
2758  return _mm_insert_ps(value, tmp, N << 4);
2759 #elif defined(NLIB_NEON)
2760  return __builtin_constant_p(v) ?
2761  F128::Permute<N == 0 ? 4 : 0,
2762  N == 1 ? 5 : 1,
2763  N == 2 ? 6 : 2,
2764  N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
2765  vsetq_lane_f32(v, value, N);
2766 #elif defined(CAFE)
2767  f128 ret = value;
2768  ret.vec.ps[N / 2][N % 2] = v;
2769  return ret;
2770 #endif
2771 }
2772 
2773 // r = value, r[i] = v
2774 NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
2775 #ifdef NLIB_F128_SIMD_NOUSE
2776  f128 ret = value;
2777  ret.vec.v[i] = v;
2778  return ret;
2779 #elif defined(NLIB_SSE41)
2780  f128 tmp = _mm_set_ss(v);
2781  switch (i) {
2782  case 0:
2783  return _mm_insert_ps(value, tmp, 0x00);
2784  case 1:
2785  return _mm_insert_ps(value, tmp, 0x10);
2786  case 2:
2787  return _mm_insert_ps(value, tmp, 0x20);
2788  case 3:
2789  return _mm_insert_ps(value, tmp, 0x30);
2790  default:
2791  NLIB_ASSUME(0);
2792  break;
2793  }
2794 #elif defined(NLIB_NEON)
2795  switch (i) {
2796  case 0:
2797  return F128::SetFloatToLane<0>(value, v);
2798  case 1:
2799  return F128::SetFloatToLane<1>(value, v);
2800  case 2:
2801  return F128::SetFloatToLane<2>(value, v);
2802  case 3:
2803  return F128::SetFloatToLane<3>(value, v);
2804  default:
2805  NLIB_ASSUME(0);
2806  break;
2807  }
2808 #elif defined(CAFE)
2809  f128 ret = value;
2810  switch (i) {
2811  case 0:
2812  ret.vec.ps[0][0] = v;
2813  break;
2814  case 1:
2815  ret.vec.ps[0][1] = v;
2816  break;
2817  case 2:
2818  ret.vec.ps[1][0] = v;
2819  break;
2820  default:
2821  ret.vec.ps[1][1] = v;
2822  break;
2823  }
2824  return ret;
2825 #endif
2826 }
2827 
2828 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
2829 namespace detail {
2830 
2831 template <bool IsHighA, bool IsHighB>
2832 float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;
2833 
2834 template <>
2835 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2836 #ifdef __aarch64__
2837  return vtrn1_f32(a, b);
2838 #else
2839  return vtrn_f32(a, b).val[0];
2840 #endif
2841 };
2842 
2843 template <>
2844 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2845 #ifdef __aarch64__
2846  return vtrn1_f32(vrev64_f32(a), b);
2847 #else
2848  return vtrn_f32(vrev64_f32(a), b).val[0];
2849 #endif
2850 };
2851 
2852 template <>
2853 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2854 #ifdef __aarch64__
2855  return vtrn1_f32(a, vrev64_f32(b));
2856 #else
2857  return vtrn_f32(a, vrev64_f32(b)).val[0];
2858 #endif
2859 };
2860 
2861 template <>
2862 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2863 #ifdef __aarch64__
2864  return vtrn2_f32(a, b);
2865 #else
2866  return vtrn_f32(a, b).val[1];
2867 #endif
2868 };
2869 
2870 template <int Z>
2871 float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;
2872 
2873 template <>
2874 NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
2875  return vget_low_f32(value);
2876 }
2877 
2878 template <>
2879 NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
2880  return vget_high_f32(value);
2881 }
2882 
2883 template <int X0, int X1>
2884 struct F128SwizzleHelper2 {
2885  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2886  float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
2887  float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
2888  return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
2889  }
2890 };
2891 
2892 template <int X>
2893 struct F128SwizzleHelper2<X, X> {
2894  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2895  float32x2_t x = F128SwizzleGet64<X / 2>(value);
2896  return vdup_lane_f32(x, (X & 1));
2897  }
2898 };
2899 
2900 template <>
2901 struct F128SwizzleHelper2<0, 1> {
2902  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2903  return vget_low_f32(value);
2904  }
2905 };
2906 
2907 template <>
2908 struct F128SwizzleHelper2<0, 2> {
2909  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2910 #ifdef __aarch64__
2911  return vget_low_f32(vuzp1q_f32(value, value));
2912 #else
2913  float32x2_t lo = vget_low_f32(value);
2914  float32x2_t hi = vget_high_f32(value);
2915  return vzip_f32(lo, hi).val[0];
2916 #endif
2917  }
2918 };
2919 
2920 template <>
2921 struct F128SwizzleHelper2<0, 3> {
2922  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2923  float32x2_t lo = vget_low_f32(value);
2924  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2925 #ifdef __aarch64__
2926  return vzip1_f32(lo, hi);
2927 #else
2928  return vzip_f32(lo, hi).val[0];
2929 #endif
2930  }
2931 };
2932 
2933 template <>
2934 struct F128SwizzleHelper2<1, 0> {
2935  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2936  return vrev64_f32(vget_low_f32(value));
2937  }
2938 };
2939 
2940 template <>
2941 struct F128SwizzleHelper2<1, 2> {
2942  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2943  float32x2_t lo = vget_low_f32(value);
2944  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2945 #ifdef __aarch64__
2946  return vzip2_f32(lo, hi);
2947 #else
2948  return vzip_f32(lo, hi).val[1];
2949 #endif
2950  }
2951 };
2952 
2953 template <>
2954 struct F128SwizzleHelper2<1, 3> {
2955  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2956 #ifdef __aarch64__
2957  return vget_low_f32(vuzp2q_f32(value, value));
2958 #else
2959  float32x2_t lo = vget_low_f32(value);
2960  float32x2_t hi = vget_high_f32(value);
2961  return vzip_f32(lo, hi).val[1];
2962 #endif
2963  }
2964 };
2965 
2966 template <>
2967 struct F128SwizzleHelper2<2, 0> {
2968  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2969 #ifdef __aarch64__
2970  return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
2971 #else
2972  float32x2_t lo = vget_low_f32(value);
2973  float32x2_t hi = vget_high_f32(value);
2974  return vzip_f32(hi, lo).val[0];
2975 #endif
2976  }
2977 };
2978 
2979 template <>
2980 struct F128SwizzleHelper2<2, 1> {
2981  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2982 #ifdef __aarch64__
2983  return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
2984 #else
2985  float32x2_t lo = vget_low_f32(value);
2986  float32x2_t hi = vrev64_f32(vget_high_f32(value));
2987  return vzip_f32(hi, lo).val[1];
2988 #endif
2989  }
2990 };
2991 
2992 template <>
2993 struct F128SwizzleHelper2<2, 3> {
2994  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
2995  return vget_high_f32(value);
2996  }
2997 };
2998 
2999 template <>
3000 struct F128SwizzleHelper2<3, 0> {
3001  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3002  float32x2_t lo = vget_low_f32(value);
3003  float32x2_t hi = vrev64_f32(vget_high_f32(value));
3004 #ifdef __aarch64__
3005  return vzip1_f32(hi, lo);
3006 #else
3007  return vzip_f32(hi, lo).val[0];
3008 #endif
3009  }
3010 };
3011 
3012 template <>
3013 struct F128SwizzleHelper2<3, 1> {
3014  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3015  float32x2_t lo = vget_low_f32(value);
3016  float32x2_t hi = vget_high_f32(value);
3017 #ifdef __aarch64__
3018  return vzip2_f32(hi, lo);
3019 #else
3020  return vzip_f32(hi, lo).val[1];
3021 #endif
3022  }
3023 };
3024 
3025 template <>
3026 struct F128SwizzleHelper2<3, 2> {
3027  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3028  return vrev64_f32(vget_high_f32(value));
3029  }
3030 };
3031 
3032 template <int V0, int V1, int V2, int V3>
3033 struct F128SwizzleHelper {
3034  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3035  return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
3036  detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
3037  }
3038 };
3039 
3040 template <int Vx, int Vy>
3041 struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
3042  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3043  float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
3044  return vcombine_f32(tmp, tmp);
3045  }
3046 };
3047 
3048 template <int V>
3049 struct F128SwizzleHelper<V, V, V, V> {
3050  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
3051  return F128::SetValue<V>(value, each_select32);
3052  }
3053 };
3054 
3055 } // namespace detail
3056 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3057 namespace detail {
3058 
3059 template <int X0, int X1>
3060 struct F128SwizzleHelper {
3061  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT;
3062 };
3063 
3064 template<>
3065 struct F128SwizzleHelper<0, 0> {
3066  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3067  (void)v1;
3068  return __PS_MERGE00(v0, v0);
3069  }
3070 };
3071 
3072 template<>
3073 struct F128SwizzleHelper<0, 1> {
3074  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3075  (void)v1;
3076  return v0;
3077  }
3078 };
3079 
3080 template<>
3081 struct F128SwizzleHelper<0, 2> {
3082  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3083  return __PS_MERGE00(v0, v1);
3084  }
3085 };
3086 
3087 template<>
3088 struct F128SwizzleHelper<0, 3> {
3089  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3090  return __PS_MERGE01(v0, v1);
3091  }
3092 };
3093 
3094 template<>
3095 struct F128SwizzleHelper<1, 0> {
3096  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3097  (void)v1;
3098  return __PS_MERGE10(v0, v0);
3099  }
3100 };
3101 
3102 template<>
3103 struct F128SwizzleHelper<1, 1> {
3104  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3105  (void)v1;
3106  return __PS_MERGE11(v0, v0);
3107  }
3108 };
3109 
3110 template<>
3111 struct F128SwizzleHelper<1, 2> {
3112  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3113  return __PS_MERGE10(v0, v1);
3114  }
3115 };
3116 
3117 template<>
3118 struct F128SwizzleHelper<1, 3> {
3119  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3120  return __PS_MERGE11(v0, v1);
3121  }
3122 };
3123 
3124 template<>
3125 struct F128SwizzleHelper<2, 0> {
3126  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3127  return __PS_MERGE00(v1, v0);
3128  }
3129 };
3130 
3131 template<>
3132 struct F128SwizzleHelper<2, 1> {
3133  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3134  return __PS_MERGE01(v1, v0);
3135  }
3136 };
3137 
3138 template<>
3139 struct F128SwizzleHelper<2, 2> {
3140  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3141  (void)v0;
3142  return __PS_MERGE00(v1, v1);
3143  }
3144 };
3145 
3146 template<>
3147 struct F128SwizzleHelper<2, 3> {
3148  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3149  (void)v0;
3150  return v1;
3151  }
3152 };
3153 
3154 template<>
3155 struct F128SwizzleHelper<3, 0> {
3156  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3157  return __PS_MERGE10(v1, v0);
3158  }
3159 };
3160 
3161 template<>
3162 struct F128SwizzleHelper<3, 1> {
3163  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3164  return __PS_MERGE11(v1, v0);
3165  }
3166 };
3167 
3168 template<>
3169 struct F128SwizzleHelper<3, 2> {
3170  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3171  (void)v0;
3172  return __PS_MERGE10(v1, v1);
3173  }
3174 };
3175 
3176 template<>
3177 struct F128SwizzleHelper<3, 3> {
3178  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
3179  (void)v0;
3180  return __PS_MERGE11(v1, v1);
3181  }
3182 };
3183 
3184 } // namespace detail
3185 #endif
3186 
3187 template <int V0, int V1, int V2, int V3>
3188 // r[0] = value[V0], r[1] = value[V1], r[2] = value[V2], r[3] = value[V3]
3189 NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
3190  NLIB_STATIC_ASSERT(V0 < 4);
3191  NLIB_STATIC_ASSERT(V1 < 4);
3192  NLIB_STATIC_ASSERT(V2 < 4);
3193  NLIB_STATIC_ASSERT(V3 < 4);
3194 #if defined(NLIB_F128_SIMD_NOUSE)
3195  f128 ret;
3196  ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
3197  ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
3198  ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
3199  ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
3200  return ret;
3201 #elif __has_builtin(__builtin_shufflevector)
3202  return __builtin_shufflevector(value, value, V0, V1, V2, V3);
3203 #elif defined(NLIB_SSE41)
3204  return _mm_shuffle_ps(value, value,
3205  _MM_SHUFFLE(V3 != -1 ? V3 : 3,
3206  V2 != -1 ? V2 : 2,
3207  V1 != -1 ? V1 : 1,
3208  V0 != -1 ? V0 : 0));
3209 #elif defined(NLIB_NEON)
3210  return detail::F128SwizzleHelper<
3211  V0 != -1 ? V0 : 0,
3212  V1 != -1 ? V1 : 1,
3213  V2 != -1 ? V2 : 2,
3214  V3 != -1 ? V3 : 3>::Swizzle(value);
3215 #elif defined(CAFE)
3216  f128 ret;
3217  ret.vec.ps[0] = detail::F128SwizzleHelper<
3218  (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3219  ret.vec.ps[1] = detail::F128SwizzleHelper<
3220  (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3221  return ret;
3222 #endif
3223 }
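// Editorial example (not part of the original header): reversing the lane
// order of a vector. On SSE4.1 this lowers to a single _mm_shuffle_ps; with
// __builtin_shufflevector the compiler selects the shuffle itself.
//
//   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 r = F128::Swizzle<3, 2, 1, 0>(v); // r = {4.f, 3.f, 2.f, 1.f}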
3224 
3225 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3226 // Swizzle specialization for NEON
3227 template <>
3228 NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
3229 #ifdef __aarch64__
3230  return vzip1q_f32(value, value);
3231 #else
3232  return vzipq_f32(value, value).val[0];
3233 #endif
3234 }
3235 template <>
3236 NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
3237 #ifdef __aarch64__
3238  return vtrn1q_f32(value, value);
3239 #else
3240  return vtrnq_f32(value, value).val[0];
3241 #endif
3242 }
3243 template <>
3244 NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
3245  return value;
3246 }
3247 template <>
3248 NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
3249 #ifdef __aarch64__
3250  return vuzp1q_f32(value, value);
3251 #else
3252  return vuzpq_f32(value, value).val[0];
3253 #endif
3254 }
3255 template <>
3256 NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
3257  return vrev64q_f32(value);
3258 }
3259 template <>
3260 NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3261 #ifdef __aarch64__
3262  return vtrn2q_f32(value, value);
3263 #else
3264  return vtrnq_f32(value, value).val[1];
3265 #endif
3266 }
3267 template <>
3268 NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
3269  uint32x4_t ival = vreinterpretq_u32_f32(value);
3270  uint32x4_t rotated = vextq_u32(ival, ival, 1);
3271  return vreinterpretq_f32_u32(rotated);
3272 }
3273 template <>
3274 NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
3275 #ifdef __aarch64__
3276  return vuzp2q_f32(value, value);
3277 #else
3278  return vuzpq_f32(value, value).val[1];
3279 #endif
3280 }
3281 template <>
3282 NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3283 #ifdef __aarch64__
3284  return vzip2q_f32(value, value);
3285 #else
3286  return vzipq_f32(value, value).val[1];
3287 #endif
3288 }
3289 template <>
3290 NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
3291  uint32x4_t ival = vreinterpretq_u32_f32(value);
3292  uint32x4_t rotated = vextq_u32(ival, ival, 2);
3293  return vreinterpretq_f32_u32(rotated);
3294 }
3295 template <>
3296 NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
3297  uint32x4_t ival = vreinterpretq_u32_f32(value);
3298  uint32x4_t rotated = vextq_u32(ival, ival, 3);
3299  return vreinterpretq_f32_u32(rotated);
3300 }
3301 #endif
3302 
3303 namespace detail {
3304 
3305 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3306 template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
3307 struct F128PermuteHelper2 {
3308  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3309  f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
3310  f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
3311  return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3312  ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3313  }
3314 };
3315 
3316 template <bool UseShuffle, int V0, int V1, int V2, int V3>
3317 struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
3318  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3319  return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3320  ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3321  }
3322 };
3323 
3324 template <int V0, int V1, int V2, int V3>
3325 struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
3326  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3327  return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
3328  _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
3329  }
3330 };
3331 
3332 template <>
3333 struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
3334  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3335  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
3336  return _mm_castsi128_ps(tmp);
3337  }
3338 };
3339 
3340 template <>
3341 struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
3342  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3343  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
3344  return _mm_castsi128_ps(tmp);
3345  }
3346 };
3347 
3348 template <>
3349 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3350  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3351  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), 4);
3352  return _mm_castsi128_ps(tmp);
3353  }
3354 };
3355 
3356 template <int V>
3357 struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
3358  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3359  NLIB_STATIC_ASSERT(V > 3);
3360  return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
3361  }
3362 };
3363 
3364 template <int V>
3365 struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
3366  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3367  NLIB_STATIC_ASSERT(V > 3);
3368  return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
3369  }
3370 };
3371 
3372 template <int V>
3373 struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
3374  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3375  NLIB_STATIC_ASSERT(V > 3);
3376  return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
3377  }
3378 };
3379 
3380 template <int V>
3381 struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
3382  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3383  NLIB_STATIC_ASSERT(V > 3);
3384  return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
3385  }
3386 };
3387 
3388 template <int V>
3389 struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
3390  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3391  NLIB_STATIC_ASSERT(V < 4);
3392  return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
3393  }
3394 };
3395 
3396 template <int V>
3397 struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
3398  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3399  NLIB_STATIC_ASSERT(V < 4);
3400  return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
3401  }
3402 };
3403 
3404 template <int V>
3405 struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
3406  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3407  NLIB_STATIC_ASSERT(V < 4);
3408  return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
3409  }
3410 };
3411 
3412 template <int V>
3413 struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
3414  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3415  NLIB_STATIC_ASSERT(V < 4);
3416  return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
3417  }
3418 };
3419 
3420 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3421 struct F128PermuteHelper {
3422  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3423  return F128PermuteHelper2<
3424  ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
3425  ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
3426  V0, V1, V2, V3>::Permute(a, b);
3427  }
3428 };
3429 
3430 #elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3431 
3432 template <int Z>
3433 float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;
3434 
3435 template <>
3436 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3437  NLIB_UNUSED(b);
3438  return vget_low_f32(a);
3439 }
3440 template <>
3441 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3442  NLIB_UNUSED(b);
3443  return vget_high_f32(a);
3444 }
3445 template <>
3446 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3447  NLIB_UNUSED(a);
3448  return vget_low_f32(b);
3449 }
3450 template <>
3451 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3452  NLIB_UNUSED(a);
3453  return vget_high_f32(b);
3454 }
3455 
3456 template <int X0, int X1>
3457 struct F128PermuteHelper2 {
3458  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3459  float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
3460  float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
3461  return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
3462  }
3463 };
3464 
3465 template <int X>
3466 struct F128PermuteHelper2<X, X> {
3467  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3468  float32x2_t x = F128PermuteGet64<X / 2>(a, b);
3469  return vdup_lane_f32(x, (X & 1));
3470  }
3471 };
3472 
3473 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3474 struct F128PermuteHelper {
3475  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3476  return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
3477  F128PermuteHelper2<V2, V3>::Permute(a, b));
3478  }
3479 };
3480 
3481 template <>
3482 struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
3483  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3484  int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
3485  return vreinterpretq_f32_s32(tmp);
3486  }
3487 };
3488 
3489 template <>
3490 struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
3491  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3492  int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
3493  return vreinterpretq_f32_s32(tmp);
3494  }
3495 };
3496 
3497 template <>
3498 struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
3499  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3500  int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
3501  return vreinterpretq_f32_s32(tmp);
3502  }
3503 };
3504 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3505 template<int R0, int R1, int VAR0, int VAR1>
3506 struct F128PermuteHelper2 {
3507  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
3508 };
3509 
3510 template<int R0, int R1>
3511 struct F128PermuteHelper2<R0, R1, 0, 0> {
3512  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3513  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
3514  }
3515 };
3516 
3517 template<int R0, int R1>
3518 struct F128PermuteHelper2<R0, R1, 0, 1> {
3519  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3520  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
3521  }
3522 };
3523 
3524 template<int R0, int R1>
3525 struct F128PermuteHelper2<R0, R1, 0, 2> {
3526  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3527  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
3528  }
3529 };
3530 
3531 template<int R0, int R1>
3532 struct F128PermuteHelper2<R0, R1, 0, 3> {
3533  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3534  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
3535  }
3536 };
3537 
3538 template<int R0, int R1>
3539 struct F128PermuteHelper2<R0, R1, 1, 0> {
3540  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3541  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
3542  }
3543 };
3544 
3545 template<int R0, int R1>
3546 struct F128PermuteHelper2<R0, R1, 1, 1> {
3547  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3548  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
3549  }
3550 };
3551 
3552 template<int R0, int R1>
3553 struct F128PermuteHelper2<R0, R1, 1, 2> {
3554  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3555  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
3556  }
3557 };
3558 
3559 template<int R0, int R1>
3560 struct F128PermuteHelper2<R0, R1, 1, 3> {
3561  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3562  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
3563  }
3564 };
3565 
3566 template<int R0, int R1>
3567 struct F128PermuteHelper2<R0, R1, 2, 0> {
3568  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3569  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
3570  }
3571 };
3572 
3573 template<int R0, int R1>
3574 struct F128PermuteHelper2<R0, R1, 2, 1> {
3575  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3576  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
3577  }
3578 };
3579 
3580 template<int R0, int R1>
3581 struct F128PermuteHelper2<R0, R1, 2, 2> {
3582  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3583  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
3584  }
3585 };
3586 
3587 template<int R0, int R1>
3588 struct F128PermuteHelper2<R0, R1, 2, 3> {
3589  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3590  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
3591  }
3592 };
3593 
3594 template<int R0, int R1>
3595 struct F128PermuteHelper2<R0, R1, 3, 0> {
3596  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3597  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
3598  }
3599 };
3600 
3601 template<int R0, int R1>
3602 struct F128PermuteHelper2<R0, R1, 3, 1> {
3603  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3604  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
3605  }
3606 };
3607 
3608 template<int R0, int R1>
3609 struct F128PermuteHelper2<R0, R1, 3, 2> {
3610  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3611  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
3612  }
3613 };
3614 
3615 template<int R0, int R1>
3616 struct F128PermuteHelper2<R0, R1, 3, 3> {
3617  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3618  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
3619  }
3620 };
3621 
3622 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3623 struct F128PermuteHelper {
3624  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3625  f128 ret;
3626  f32x2 x0 = a.vec.ps[0];
3627  f32x2 x1 = a.vec.ps[1];
3628  f32x2 x2 = b.vec.ps[0];
3629  f32x2 x3 = b.vec.ps[1];
3630  ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
3631  ::Permute(x0, x1, x2, x3);
3632  ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
3633  ::Permute(x0, x1, x2, x3);
3634  return ret;
3635  }
3636 };
3637 #else
3638 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3639 struct F128PermuteHelper {
3640  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3641  f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
3642  F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
3643  F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
3644  F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
3645  return ret;
3646  }
3647 };
3648 #endif
3649 
3650 template <int V0, int V1, int V2, int V3>
3651 struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
3652  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3653  NLIB_UNUSED(b);
3654  return F128::Swizzle<V0, V1, V2, V3>(a);
3655  }
3656 };
3657 
3658 template <int V0, int V1, int V2, int V3>
3659 struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
3660  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3661  NLIB_UNUSED(a);
3662  return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
3663  }
3664 };
3665 
3666 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3667 // Permute specialization for SSE4.1
3668 template <>
3669 struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
3670  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3671  return _mm_unpacklo_ps(a, b);
3672  }
3673 };
3674 template <>
3675 struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
3676  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3677  return _mm_unpacklo_ps(b, a);
3678  }
3679 };
3680 template <>
3681 struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
3682  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3683  return _mm_unpackhi_ps(a, b);
3684  }
3685 };
3686 template <>
3687 struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
3688  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3689  return _mm_unpackhi_ps(b, a);
3690  }
3691 };
3692 #endif
3693 
3694 template<int V0, int V1, int V2, int V3>
3695 struct F128PermuteDontCareHelper {
3696  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3697  NLIB_STATIC_ASSERT(V0 < 8);
3698  NLIB_STATIC_ASSERT(V1 < 8);
3699  NLIB_STATIC_ASSERT(V2 < 8);
3700  NLIB_STATIC_ASSERT(V3 < 8);
3701  static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
3702  static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
3703  return detail::F128PermuteHelper< arg1, arg2,
3704  V0, V1, V2, V3 >::Permute(a, b);
3705  }
3706 };
3707 
3708 template<int V1, int V2, int V3>
3709 struct F128PermuteDontCareHelper<8, V1, V2, V3> {
3710  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3711  NLIB_STATIC_ASSERT(V1 < 8);
3712  NLIB_STATIC_ASSERT(V2 < 8);
3713  NLIB_STATIC_ASSERT(V3 < 8);
3714  static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3715  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3716  }
3717 };
3718 
3719 template<int V0, int V2, int V3>
3720 struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
3721  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3722  NLIB_STATIC_ASSERT(V0 < 8);
3723  NLIB_STATIC_ASSERT(V2 < 8);
3724  NLIB_STATIC_ASSERT(V3 < 8);
3725  static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
3726  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3727  }
3728 };
3729 
3730 template<int V0, int V1, int V3>
3731 struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
3732  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3733  NLIB_STATIC_ASSERT(V0 < 8);
3734  NLIB_STATIC_ASSERT(V1 < 8);
3735  NLIB_STATIC_ASSERT(V3 < 8);
3736  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3737  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3738  }
3739 };
3740 
3741 template<int V0, int V1, int V2>
3742 struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
3743  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3744  NLIB_STATIC_ASSERT(V0 < 8);
3745  NLIB_STATIC_ASSERT(V1 < 8);
3746  NLIB_STATIC_ASSERT(V2 < 8);
3747  static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
3748  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3749  }
3750 };
3751 
3752 template<int V2, int V3>
3753 struct F128PermuteDontCareHelper<8, 8, V2, V3> {
3754  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3755  NLIB_STATIC_ASSERT(V2 < 8);
3756  NLIB_STATIC_ASSERT(V3 < 8);
3757  static const int V0 = (V2 < 4) ? 0 : 4;
3758  return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
3759  }
3760 };
3761 
3762 template<int V1, int V2>
3763 struct F128PermuteDontCareHelper<8, V1, V2, 8> {
3764  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3765  NLIB_STATIC_ASSERT(V1 < 8);
3766  NLIB_STATIC_ASSERT(V2 < 8);
3767  static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3768  static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3769  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3770  }
3771 };
3772 
3773 template<int V0, int V1>
3774 struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
3775  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3776  NLIB_STATIC_ASSERT(V0 < 8);
3777  NLIB_STATIC_ASSERT(V1 < 8);
3778  static const int V2 = (V1 < 4) ? 2 : 6;
3779  return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
3780  }
3781 };
3782 
3783 template<int V0, int V3>
3784 struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
3785  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3786  NLIB_STATIC_ASSERT(V0 < 8);
3787  NLIB_STATIC_ASSERT(V3 < 8);
3788  static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3789  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3790  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3791  }
3792 };
3793 
3794 template<int V0, int V2>
3795 struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
3796  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3797  NLIB_STATIC_ASSERT(V0 < 8);
3798  NLIB_STATIC_ASSERT(V2 < 8);
3799  static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3800  static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3801  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3802  }
3803 };
3804 
3805 template<int V1, int V3>
3806 struct F128PermuteDontCareHelper<8, V1, 8, V3> {
3807  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3808  NLIB_STATIC_ASSERT(V1 < 8);
3809  NLIB_STATIC_ASSERT(V3 < 8);
3810  static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3811  static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3812  return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3813  }
3814 };
3815 
3816 template<int V>
3817 struct F128PermuteDontCareHelper<V, 8, 8, 8> {
3818  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3819  NLIB_STATIC_ASSERT(V < 8);
3820  static const int V1 = ((V & 3) == 0) ? V + 1 : V;
3821  static const int V2 = ((V & 3) == 0) ? V + 2 : V;
3822  static const int V3 = ((V & 3) == 0) ? V + 3 : V;
3823  return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
3824  }
3825 };
3826 
3827 template<int V>
3828 struct F128PermuteDontCareHelper<8, V, 8, 8> {
3829  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3830  NLIB_STATIC_ASSERT(V < 8);
3831  static const int V0 = ((V & 3) == 1) ? V - 1 : V;
3832  static const int V2 = ((V & 3) == 1) ? V + 1 : V;
3833  static const int V3 = ((V & 3) == 1) ? V + 2 : V;
3834  return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
3835  }
3836 };
3837 
3838 template<int V>
3839 struct F128PermuteDontCareHelper<8, 8, V, 8> {
3840  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3841  NLIB_STATIC_ASSERT(V < 8);
3842  static const int V0 = ((V & 3) == 2) ? V - 2 : V;
3843  static const int V1 = ((V & 3) == 2) ? V - 1 : V;
3844  static const int V3 = ((V & 3) == 2) ? V + 2 : V;
3845  return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
3846  }
3847 };
3848 
3849 template<int V>
3850 struct F128PermuteDontCareHelper<8, 8, 8, V> {
3851  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3852  NLIB_STATIC_ASSERT(V < 8);
3853  static const int V0 = ((V & 3) == 3) ? V - 3 : V;
3854  static const int V1 = ((V & 3) == 3) ? V - 2 : V;
3855  static const int V2 = ((V & 3) == 3) ? V - 1 : V;
3856  return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
3857  }
3858 };
3859 
3860 template <>
3861 struct F128PermuteDontCareHelper<8, 8, 8, 8> {
3862  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3863  NLIB_UNUSED(b);
3864  return a;
3865  }
3866 };
3867 
3868 } // namespace detail
3869 
3870 template <int V0, int V1, int V2, int V3>
3871 // r[i] = Vi < 4 ? a[Vi] : b[Vi - 4] for each lane i (Vi == -1 means don't care)
3872 NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3873 #if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
3874  return __builtin_shufflevector(a, b,
3875  (V0 != 8 ? V0 : -1),
3876  (V1 != 8 ? V1 : -1),
3877  (V2 != 8 ? V2 : -1),
3878  (V3 != 8 ? V3 : -1));
3879 #else
3880  return detail::F128PermuteDontCareHelper <
3881  V0 != -1 ? V0 : 8,
3882  V1 != -1 ? V1 : 8,
3883  V2 != -1 ? V2 : 8,
3884  V3 != -1 ? V3 : 8>::Permute(a, b);
3885 #endif
3886 }
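// Editorial example (not part of the original header): lane indices 0..3
// select from a, 4..7 select from b, and -1 (internally 8) marks a don't-care
// lane so the helpers can pick the cheapest instruction.
//
//   f128 a = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 b = F128::SetValue(5.f, 6.f, 7.f, 8.f);
//   f128 lo = F128::Permute<0, 4, 1, 5>(a, b); // {1.f, 5.f, 2.f, 6.f}; _mm_unpacklo_ps on SSE4.1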
3887 
3888 template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
3889 // r[i] = SplatLane(i) ? splat[i] : value[i]
3890 // note that splat must be a broadcast vector: splat[0] == splat[1] == splat[2] == splat[3]
3891 NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
3892 #if defined(NLIB_NEON)
3893  const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
3894  const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
3895  const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
3896  const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
3897 #else
3898  // SSE4.1 has _mm_blend_ps()
3899  const int v0 = SplatLane0 ? 4 : 0;
3900  const int v1 = SplatLane1 ? 5 : 1;
3901  const int v2 = SplatLane2 ? 6 : 2;
3902  const int v3 = SplatLane3 ? 7 : 3;
3903 #endif
3904  return F128::Permute<v0, v1, v2, v3>(value, splat);
3905 }
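// Editorial example (not part of the original header): replacing lanes 1 and
// 3 of value with the broadcast number held in splat. Because every lane of
// splat is equal, the NEON path may reorder source lanes for a cheaper shuffle.
//
//   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 s = F128::SetValue(9.f, each_float);            // all lanes equal, as required
//   f128 r = F128::Splat<false, true, false, true>(v, s); // {1.f, 9.f, 3.f, 9.f}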
3906 
3907 NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
3908 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3909  f128 ret;
3910  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
3911  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
3912  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
3913  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
3914  return ret;
3915 #else
3916  i128 iround = F128::ConvertToI128Round(value);
3917  f128 fround = F128::ConvertFromI128(iround);
3918  f128 x = F128::Sub(value, fround);
3919  f128 xx = F128::Mult(x, x);
3920 
3921  f128 P = F128::LoadA16(F128::exp2_P_);
3922  f128 Q = F128::LoadA16(F128::exp2_Q_);
3923 
3924  f128 px;
3925  // px = P[0]
3926  // px = px * xx + P[1]
3927  // px = px * xx + P[2]
3928  // px = x * px;
3929  px = F128::MultAdd<0>(P, xx, F128::SetValue<1>(P, each_select32), each_select32);
3930  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
3931  px = F128::Mult(x, px);
3932 
3933  f128 qx;
3934  // qx = xx + Q[0]
3935  // qx = qx * xx + Q[1]
3936  qx = F128::Add(xx, F128::SetValue<0>(Q, each_select32));
3937  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
3938 
3939  x = F128::Div(px, F128::Sub(qx, px));
3940  x = F128::MultAdd<3>(Q, x, F128::SetValue<2>(Q, each_select32), each_select32);
3941 
3942  // x = x * 2^iround
3943  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
3944  iround = I128::ShiftLeftLogical32(iround, 23);
3945  x = F128::Mult(x, F128::CastFromI128(iround));
3946 
3947  // NOTE:
3948  // overflow not checked
3949  return x;
3950 #endif
3951 }
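// Editorial note: after the rational approximation over the reduced range,
// the code above rebuilds the factor 2^iround by writing the biased exponent
// directly into the IEEE-754 bit pattern. A scalar sketch of that step
// (assuming n keeps the result within the normal float range):
//
//   int32_t bits = (n + 127) << 23;       // biased exponent, zero mantissa
//   float scale;
//   memcpy(&scale, &bits, sizeof(scale)); // scale == 2^n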
3952 
3953 NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
3954  static const float log2e = 1.44269504088896340736f;
3955  return Exp2(F128::Mult(log2e, value));
3956 }
3957 
3958 NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
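 // sinh(x) = (e^x - e^-x) / 2 = 2^(x*log2e - 1) - 2^(-x*log2e - 1);
 // the -1 terms below fold the division by two into the exponents.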
3959  static const float log2e = 1.44269504088896340736f;
3960  f128 neg_one = F128::SetValue(-1.f, each_float);
3961  f128 v0 = F128::MultAdd(log2e, value, neg_one);
3962  f128 v1 = F128::MultSub(log2e, value, neg_one);
3963  f128 e0 = Exp2(v0);
3964  f128 e1 = Exp2(v1);
3965  return F128::Sub(e0, e1);
3966 }
3967 
3968 NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
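 // cosh(x) = (e^x + e^-x) / 2 = 2^(x*log2e - 1) + 2^(-x*log2e - 1)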
3969  static const float log2e = 1.44269504088896340736f;
3970  f128 neg_one = F128::SetValue(-1.f, each_float);
3971  f128 v0 = F128::MultAdd(log2e, value, neg_one);
3972  f128 v1 = F128::MultSub(log2e, value, neg_one);
3973  f128 e0 = Exp2(v0);
3974  f128 e1 = Exp2(v1);
3975  return F128::Add(e0, e1);
3976 }
3977 
3978 NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
3979  // tanh(x) = 1 - 2 / (1 + expE(2x))
3980  f128 cvalue = F128::LoadA16(tanh_cvalue_);
3981  f128 e = F128::Mult<0>(cvalue, value, each_select32);
3982  e = F128::Exp2(e);
3983  f128 half = F128::SetValue<2>(cvalue, each_select32);
3984  e = F128::MultAdd(half, e, half);
3985  e = F128::Recp(e);
3986  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
3987 }
3988 
3989 NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
3990 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3991  f128 ret;
3992  ret.vec.v[0] = tanf(value.vec.v[0]);
3993  ret.vec.v[1] = tanf(value.vec.v[1]);
3994  ret.vec.v[2] = tanf(value.vec.v[2]);
3995  ret.vec.v[3] = tanf(value.vec.v[3]);
3996  return ret;
3997 #else
3998  // Cody and Waite algorithm
3999  f128 C = F128::LoadA16(&F128::tan_c_[0]);
4000 
4001  // g = round(value / (pi/2))
4002  f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
4003  f128 nearx_axis;
4004  {
4005  i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
4006  i128 cmp = I128::CmpEq32(t0, I128::SetZero());
4007  nearx_axis = F128::CastFromI128(cmp);
4008  }
4009 
4010  // f = value - (pi/2) * g
4011  f128 f = F128::MultSub<1>(C, g, value, each_select32);
4012  f = F128::MultSub<2>(C, g, f, each_select32);
4013 
4014  f128 near_axis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));
4015 
4016  f128 P = F128::LoadA16(&F128::tan_p_[0]);
4017  f128 Q = F128::LoadA16(&F128::tan_q_[0]);
4018 
4019  f128 ff = F128::Mult(f, f);
4020  f128 one = F128::SetValue<3>(P, each_select32);
4021 
4022  f128 p = F128::MultAdd<2>(P, ff, F128::SetValue<1>(P, each_select32), each_select32);
4023  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
4024  p = F128::MultAdd(p, ff, one);
4025  p = F128::Mult(f, p);
4026 
4027  f128 q = F128::MultAdd<3>(Q, ff, F128::SetValue<2>(Q, each_select32), each_select32);
4028  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
4029  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
4030  q = F128::MultAdd(q, ff, one);
4031 
4032  p = F128::Select(near_axis, f, p);
4033  q = F128::Select(near_axis, one, q);
4034 
4035  f128 r0 = F128::Div(p, q);
4036  f128 r1 = F128::Negate(F128::Recp(r0));
4037 
4038  return F128::Select(nearx_axis, r0, r1);
4039 #endif
4040 }
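// Editorial note: the reduction above is the classic Cody and Waite scheme:
// pi/2 is split into a high part C1 (exactly representable, tan_c_[1]) and a
// low part C2 (tan_c_[2]), and f = value - g*C1 - g*C2 stays accurate where a
// single subtraction of g*(pi/2) would cancel. Scalar sketch of the same idea:
//
//   float g = roundf(x * (2.0f / pi));
//   float f = x - g * C1;  // C1: leading bits of pi/2
//   f       = f - g * C2;  // C2: pi/2 - C1 (the discarded low bits)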
4041 
4042 NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
4043 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
4044  static const float scale = 1.4426950408889634f; // 1 / LogE(2.0)
4045  f128 ret;
4046  ret.vec.v[0] = logf(value.vec.v[0]);
4047  ret.vec.v[1] = logf(value.vec.v[1]);
4048  ret.vec.v[2] = logf(value.vec.v[2]);
4049  ret.vec.v[3] = logf(value.vec.v[3]);
4050  return F128::Mult(scale, ret);
4051 #else
4052  // x = frexp(value, &e)
4053  f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
4054  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
4055  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
4056  e = I128::ShiftRightLogical32(e, 23);
4057  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
4058 
4059  x = F128::Sub(x, F128::SetOne());
4060  f128 z = F128::Mult(x, x);
4061  f128 y;
4062 
4063  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
4064  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
4065  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
4066 
4067  f128 p = F128::SetValue<0>(pq0, each_select32);
4068  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
4069  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
4070  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
4071  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
4072  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
4073 
4074  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
4075  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
4076  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
4077  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
4078  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
4079 
4080  y = F128::Mult(z, p);
4081  y = F128::Div(y, q);
4082  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
4083 
4084  f128 result;
4085  {
4086  // do not optimize
4087  f128 log2ea = F128::SetValue<3>(pq2, each_select32);
4088  result = F128::Mult(y, log2ea);
4089  result = F128::MultAdd(log2ea, x, result);
4090  result = F128::Add(result, y);
4091  result = F128::Add(result, x);
4092  result = F128::Add(result, F128::ConvertFromI128(e));
4093  }
4094 
4095  {
4096  f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
4097 
4098  // value is NaN -> NaN
4099  f128 is_nan = F128::IsNaN(value);
4100  f128 nan = F128::SetValue<0>(nan_inf, each_select32);
4101  result = F128::Select(is_nan, nan, result);
4102 
4103  f128 is_inf = F128::IsInfinite(value);
4104  f128 is_pos = F128::CmpGtZero(value);
4105 
4106  // value == inf -> +inf
4107  f128 inf = F128::SetValue<1>(nan_inf, each_select32);
4108  f128 is_pos_inf = F128::And(is_inf, is_pos);
4109  result = F128::Select(is_pos_inf, inf, result);
4110 
4111  // value == 0 -> -inf
4112  f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
4113  f128 is_zero = F128::CmpEqZero(value);
4114  result = F128::Select(is_zero, neg_inf, result);
4115 
4116  // value < 0 -> -NaN
4117  f128 neg_nan = F128::SetValue<2>(nan_inf, each_select32);
4118  f128 is_neg = F128::CmpLtZero(value);
4119  result = F128::Select(is_neg, neg_nan, result);
4120 
4121  // otherwise -> Log2(value)
4122  }
4123 
4124  return result;
4125 #endif
4126 }
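// Editorial note: the two masks at the top implement a branch-free frexp():
// the exponent field is forced to the bias 127 so that x falls in [1, 2),
// while e receives the unbiased exponent. Scalar sketch of the decomposition:
//
//   uint32_t bits;
//   memcpy(&bits, &v, sizeof(bits));
//   int e = (int)((bits & 0x7F800000u) >> 23) - 127;
//   bits  = (bits & 0x807FFFFFu) | (127u << 23);
//   float x;
//   memcpy(&x, &bits, sizeof(x));  // v == x * 2^e for normal v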
4127 
4128 NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
4129 #ifdef NLIB_F128_SIMD_NOUSE
4130  f128 ret;
4131  ret.vec.v[0] = logf(value.vec.v[0]);
4132  ret.vec.v[1] = logf(value.vec.v[1]);
4133  ret.vec.v[2] = logf(value.vec.v[2]);
4134  ret.vec.v[3] = logf(value.vec.v[3]);
4135  return ret;
4136 #else
4137  f128 x = F128::Log2(value);
4138  static const float recp_log2e = 0.6931471805597018f;
4139  return F128::Mult(recp_log2e, x);
4140 #endif
4141 }
4142 
4143 #undef NLIB_M
4144 #undef NLIB_M2
4145 #endif // NLIB_DOXYGEN
4146 
4147 typedef f128 SimdVector;
4148 typedef f128arg SimdVectorArg;
4149 typedef f128 SimdQuaternion;
4150 typedef f128arg SimdQuaternionArg;
4151 typedef f128 SimdPlane;
4152 typedef f128arg SimdPlaneArg;
4153 typedef f128 SimdSphere;
4154 typedef f128arg SimdSphereArg;
4155 
4156 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4157 struct NLIB_ALIGNAS(16) SimdMatrix {
4158 #else
4159 struct SimdMatrix {
4160 #endif
4161  public:
4162  SimdMatrix() NLIB_NOEXCEPT {}
4163  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
4164  r[0] = r0;
4165  r[1] = r1;
4166  r[2] = r2;
4167  r[3] = r3;
4168  }
4169  SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4170  float m13, float m20, float m21, float m22, float m23, float m30, float m31,
4171  float m32, float m33) NLIB_NOEXCEPT;
4172  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
4173 
4174  public:
4175  f128 r[4];
4176 };
4177 
4178 inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
4179  float m12, float m13, float m20, float m21, float m22, float m23,
4180  float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
4181  r[0] = F128::SetValue(m00, m01, m02, m03);
4182  r[1] = F128::SetValue(m10, m11, m12, m13);
4183  r[2] = F128::SetValue(m20, m21, m22, m23);
4184  r[3] = F128::SetValue(m30, m31, m32, m33);
4185 }
4186 
4187 inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
4188  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4189  NLIB_ASSERT((algn & 3) == 0);
4190  switch (algn >> 2) {
4191  case 0:
4192  r[0] = F128::LoadA16(p);
4193  r[1] = F128::LoadA16(p + 4);
4194  r[2] = F128::LoadA16(p + 8);
4195  r[3] = F128::LoadA16(p + 12);
4196  break;
4197  case 1: case 3: // offset 4 or 12: only 4-byte alignment is guaranteed
4198  r[0] = F128::LoadA4(p);
4199  r[1] = F128::LoadA4(p + 4);
4200  r[2] = F128::LoadA4(p + 8);
4201  r[3] = F128::LoadA4(p + 12);
4202  break;
4203  case 2:
4204  r[0] = F128::LoadA8(p);
4205  r[1] = F128::LoadA8(p + 4);
4206  r[2] = F128::LoadA8(p + 8);
4207  r[3] = F128::LoadA8(p + 12);
4208  break;
4209  default:
4210  NLIB_ASSUME(0);
4211  break;
4212  }
4213 }
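// Editorial example (not part of the original header): the constructor above
// inspects the low four bits of p and dispatches to 16-, 8-, or 4-byte-aligned
// row loads; pointers below 4-byte alignment are rejected by the assertion.
//
//   NLIB_ALIGNAS(16) float m[16] = { /* row-major 4x4 values */ };
//   SimdMatrix mtx(&m[0]); // 16-byte aligned, so it takes the LoadA16 path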
4214 
4215 #if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
4216 typedef const SimdMatrix& SimdMatrixArg;
4217 #else
4218 typedef const SimdMatrix SimdMatrixArg;
4219 #endif
4220 
4221 #if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
4222 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4223  { \
4224  f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
4225  f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
4226  f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
4227  f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
4228  row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
4229  row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
4230  row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
4231  row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
4232  }
4233 #elif defined(NLIB_NEON)
4234 # ifdef __aarch64__
4235 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4236  { \
4237  float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4238  float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4239  uint64x2_t row0_, row1_, row2_, row3_; \
4240  row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4241  vreinterpretq_u64_f32(trn_f1_.val[0])); \
4242  row0 = vreinterpretq_f32_u64(row0_); \
4243  row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4244  vreinterpretq_u64_f32(trn_f1_.val[1])); \
4245  row1 = vreinterpretq_f32_u64(row1_); \
4246  row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4247  vreinterpretq_u64_f32(trn_f1_.val[0])); \
4248  row2 = vreinterpretq_f32_u64(row2_); \
4249  row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4250  vreinterpretq_u64_f32(trn_f1_.val[1])); \
4251  row3 = vreinterpretq_f32_u64(row3_); \
4252  }
4253 # else
4254 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4255  { \
4256  float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4257  float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4258  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
4259  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
4260  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
4261  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
4262  }
4263 # endif
4264 #elif defined(CAFE)
4265 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4266  { \
4267  f32x2 tmp0, tmp1; \
4268  tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
4269  tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
4270  row0.vec.ps[0] = tmp0; \
4271  row1.vec.ps[0] = tmp1; \
4272  tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
4273  tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
4274  row2.vec.ps[1] = tmp0; \
4275  row3.vec.ps[1] = tmp1; \
4276  tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4277  tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4278  row0.vec.ps[1] = row2.vec.ps[0]; \
4279  row1.vec.ps[1] = row3.vec.ps[0]; \
4280  row2.vec.ps[0] = tmp0; \
4281  row3.vec.ps[0] = tmp1; \
4282  tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4283  tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4284  row0.vec.ps[1] = tmp0; \
4285  row1.vec.ps[1] = tmp1; \
4286  }
4287 #endif
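// Editorial example (not part of the original header): in-place 4x4 transpose
// of four rows held in registers.
//
//   f128 r0 = F128::SetValue( 1.f,  2.f,  3.f,  4.f);
//   f128 r1 = F128::SetValue( 5.f,  6.f,  7.f,  8.f);
//   f128 r2 = F128::SetValue( 9.f, 10.f, 11.f, 12.f);
//   f128 r3 = F128::SetValue(13.f, 14.f, 15.f, 16.f);
//   NLIB_F128_TRANSPOSE(r0, r1, r2, r3); // r0 == {1.f, 5.f, 9.f, 13.f}, etc.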
4288 
4289 struct NLIB_VIS_PUBLIC Float3 {
4290  float x;
4291  float y;
4292  float z;
4293 };
4294 
4295 struct NLIB_VIS_PUBLIC Float4 {
4296  float x;
4297  float y;
4298  float z;
4299  float w;
4300 };
4301 
4302 struct NLIB_VIS_PUBLIC Float2x3 {
4303  float m[2][3];
4304 };
4305 
4306 struct NLIB_VIS_PUBLIC Float3x3 {
4307  float m[3][3];
4308 };
4309 
4310 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4311 struct NLIB_ALIGNAS(16) Float3x4 {
4312 #else
4313 struct Float3x4 {
4314 #endif
4315  float m[3][4];
4316 };
4317 
4318 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4319 struct NLIB_ALIGNAS(16) Float4x3 {
4320 #else
4321 struct Float4x3 {
4322 #endif
4323  float m[4][3];
4324 };
4325 
4326 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4327 struct NLIB_ALIGNAS(16) Float4x4 {
4328 #else
4329 struct Float4x4 {
4330 #endif
4331  float m[4][4];
4332 };
4333 
4334 } // namespace simd
4335 NLIB_NAMESPACE_END
4336 
4337 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_