nlib
SimdFloat.h
[Detailed description]
1 
2 /*---------------------------------------------------------------------------*
3 
4  Project: CrossRoad
5  Copyright (C)2012-2016 Nintendo. All rights reserved.
6 
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law. They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12 
13  *---------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
18 
19 #ifdef NN_PLATFORM_CTR
20 # ifndef __USE_C99_MATH
21 # define __USE_C99_MATH
22 # endif
23 #endif
24 #include <math.h>
25 #include <float.h>
26 
27 #include "nn/nlib/Config.h"
28 #include "nn/nlib/simd/SimdInt.h"
29 
30 #ifndef INFINITY
31 #define INFINITY ((float)(1e+300 * 1e+300))
32 #endif
33 
34 #if !defined(NLIB_SIMD) && !defined(CAFE)
35 #define NLIB_F128_SIMD_NOUSE
36 #endif
37 
#ifdef NLIB_F128_SIMD_NOUSE
// Scalar fallback (no SIMD available): four lanes kept in a union so the same
// storage can be read as floats or as raw 32-bit patterns (the bitwise and
// comparison-mask operations need the uint32_t view).
typedef struct {
    union {
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_SSE41)
// x86: one SSE register per vector.
typedef __m128 nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#elif defined(NLIB_NEON)
// ARM: NEON quad register; the x2 form maps to the native pair type.
typedef float32x4_t nlib_f128_t;
typedef float32x4x2_t nlib_f128x2_t;
#elif defined(CAFE)
// CAFE: two paired-single (f32x2) halves form one logical 128-bit vector;
// the float/uint32_t views alias the same storage as in the scalar fallback.
typedef struct {
    union {
        f32x2 ps[2];
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct { nlib_f128_t val[2]; } nlib_f128x2_t;
#endif
62 
63 NLIB_NAMESPACE_BEGIN
64 namespace simd {
65 
66 // use each_float for the argument value
67 struct each_float_tag {};
69 
70 // __m128(SSE), float32x4_t(NEON)
71 typedef nlib_f128_t f128;
72 // float32x4x2_t(NEON)
74 
75 #if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
76 typedef const f128& f128arg;
77 #else
78 typedef const f128 f128arg;
79 #endif
80 
81 #if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
82 typedef const f128& f128arg_ex;
83 #else
84 typedef const f128 f128arg_ex;
85 #endif
86 
87 #if !defined(_MSC_VER) || _MSC_VER < 1800
88 #ifndef __vectorcall
89 #define __vectorcall
90 #endif
91 #endif
92 
94  public:
95  static f128 __vectorcall SetValue(float v, each_float_tag) NLIB_NOEXCEPT;
96  static f128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
97  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
98  template <size_t N>
99  static f128 __vectorcall SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT;
100  static f128 __vectorcall SetZero() NLIB_NOEXCEPT;
101  static f128 __vectorcall Set1000() NLIB_NOEXCEPT;
102  static f128 __vectorcall Set0100() NLIB_NOEXCEPT;
103  static f128 __vectorcall Set0010() NLIB_NOEXCEPT;
104  static f128 __vectorcall Set0001() NLIB_NOEXCEPT;
105  template <size_t N>
106  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
107  static f128 __vectorcall SetOne() NLIB_NOEXCEPT;
108  static f128 __vectorcall SetNegativeOne() NLIB_NOEXCEPT;
109  static f128 __vectorcall SetEpsilon() NLIB_NOEXCEPT;
110  static f128 __vectorcall SetInfinity() NLIB_NOEXCEPT;
111  static f128 __vectorcall SetNaN() NLIB_NOEXCEPT;
112  static f128 __vectorcall SetSignMask() NLIB_NOEXCEPT;
113 
114  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
115  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
116  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
117  static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
118  static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
119  static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
120  static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
121  static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
122  static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;
123 
124  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
125  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
126  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
127  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
128  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
129  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
130  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
131  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
132  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
133 
134  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
135  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
136  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
137  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
138  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
140 
141  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
142  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
143  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
144  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
145  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
146  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
147 
148  /*
149  static f128 __vectorcall ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT;
150  static f128 __vectorcall ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT;
151  static f128 __vectorcall ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT;
152  */
153 
154 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
155  static f128 __vectorcall ConvertFromI128(i128 value) NLIB_NOEXCEPT;
156  static i128 __vectorcall ConvertToI128Round(f128 value) NLIB_NOEXCEPT;
157  static i128 __vectorcall ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT;
158 
159  static f128 __vectorcall CastFromI128(i128 value) NLIB_NOEXCEPT;
160  static i128 __vectorcall CastToI128(f128 value) NLIB_NOEXCEPT;
161 
162  template<int N>
163  static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
164  template<int N>
165  static i128 __vectorcall ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT;
166 #endif
167 
168  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
169  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
170  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
171  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
172  template <size_t N>
173  static f128 __vectorcall Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT;
174  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
175  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
176  template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
177  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
178  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
179  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
180  template <size_t N>
181  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
183  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
184  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
185  template <size_t N>
186  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
188  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
189  static f128 __vectorcall Abs(f128arg value) NLIB_NOEXCEPT;
190  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
191 
192  //
193  // Min/Max
194  //
195 
196  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
197  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
198  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
199  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
200  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
201  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
202 
203  //
204  // Reciprocal/Sqrt
205  //
206 
207  static f128 __vectorcall Recp(f128arg value) NLIB_NOEXCEPT;
208  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
209  static f128 __vectorcall Sqrt(f128arg value) NLIB_NOEXCEPT;
210  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
211  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
212  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
213 
214  //
215  // Round/Truncate
216  //
217 
218  static f128 __vectorcall Round(f128arg value) NLIB_NOEXCEPT;
219  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
220  static f128 __vectorcall Floor(f128arg value) NLIB_NOEXCEPT;
221  static f128 __vectorcall Ceil(f128arg value) NLIB_NOEXCEPT;
222 
223  //
224  // Logical Operations
225  //
226 
227  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
228  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
229  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
230  static f128 __vectorcall Not(f128arg a) NLIB_NOEXCEPT;
231  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
232  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
233 
234  //
235  // Comparison Operations
236  //
237 
238  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
239  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
240  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
241  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
242  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
243  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
244  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
245  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
246 
247  static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
248  static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
249  static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
250  static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
251  static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
252  static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
253  static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;
254 
255  //
256  // Trigonometric function
257  //
258  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
259  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
260  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
261  static f128 __vectorcall Sin(f128arg value) NLIB_NOEXCEPT;
262  static f128 __vectorcall Cos(f128arg value) NLIB_NOEXCEPT;
263  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
264  static f128 __vectorcall Tan(f128arg value) NLIB_NOEXCEPT;
265  static f128 __vectorcall SinH(f128arg value) NLIB_NOEXCEPT;
266  static f128 __vectorcall CosH(f128arg value) NLIB_NOEXCEPT;
267  static f128 __vectorcall TanH(f128arg value) NLIB_NOEXCEPT;
268  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
269  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
270  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
271  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
272  // and Est version needed?
273 
274  //
275  // Interpolation
276  //
277 
278  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
279  static f128 __vectorcall
280  Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
281  static f128 __vectorcall
282  CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
283  static f128 __vectorcall
284  BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
285 
286  //
287  // Exp/Log
288  //
289  static f128 __vectorcall Exp2(f128arg value) NLIB_NOEXCEPT;
290  static f128 __vectorcall ExpE(f128arg value) NLIB_NOEXCEPT;
291  static f128 __vectorcall Log2(f128arg value) NLIB_NOEXCEPT;
292  static f128 __vectorcall LogE(f128arg value) NLIB_NOEXCEPT; // not implemented
293 
294  //
295  // Misc
296  //
297 
298  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
299  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
300  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
301  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
302  static f128 __vectorcall IsNaN(f128arg value) NLIB_NOEXCEPT;
303  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
304 
305  //
306  // Get/Set
307  //
308 
309  template <size_t N>
310  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
311  template <size_t N>
312  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
313  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
314  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
315 
316  template <size_t N>
317  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
318  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
319 
320  //
321  // Swizzle/Permute
322  //
323 
324  template <int V0, int V1, int V2, int V3>
325  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
326  template <int V0, int V1, int V2, int V3>
327  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
328  template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
329  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
330 
331  template <size_t N>
332  // left <- value[3], value[2], value[1], value[0] -> right
333  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
334  NLIB_STATIC_ASSERT(N < 4);
335  const size_t NN = 4 - N;
336  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
337  }
338  template <size_t N>
339  // left <- value[3], value[2], value[1], value[0] -> right
340  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
341  NLIB_STATIC_ASSERT(N < 4);
342  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
343  }
344  template <size_t N>
345  // left <- b[3], ..., b[0], a[3], ..., a[0] -> right
346  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
347  NLIB_STATIC_ASSERT(N < 4);
348  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
349  }
350 
351  private:
352  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v1000_[4]; // 1.f, 0.f, 0.f, 0.f
353  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0100_[4]; // 0.f, 1.f, 0.f, 0.f
354  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0010_[4]; // 0.f, 0.f, 1.f, 0.f
355  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0001_[4]; // 0.f, 0.f, 0.f, 1.f
356 
357  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float pi_values_[4]; // pi, -pi, 2pi, pi/2
358  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const uint32_t nan_inf_[4]; // nan, inf, -nan, -inf
359 
360  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R0_[4];
361  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R1_[4];
362 
363  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R0_[4];
364  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R1_[4];
365  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R2_[4];
366 
367  // Those coefficients are from Cephes Math Library
368  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_cvalue_[4];
369  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_coeff_[4];
370  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_cvalue_[4];
371  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_coeff_[4];
372  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan_coeff_[8];
373  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan2_cvalue_[4];
374 
375  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_P_[4];
376  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_Q_[4];
377  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tanh_cvalue_[4];
378  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_p_[4];
379  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_q_[4];
380  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_c_[4];
381  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float log2_PQ_[12];
382 
383  F128(); // forbidden
384  friend class Vector3;
385  friend class Vector4;
386  friend class Matrix;
387  friend class Plane;
388  friend class Quaternion;
389  friend class Sphere;
390  friend class AxisAlignedBox;
391  friend class OrientedBox;
392  friend class Frustum;
393 
394  friend class DistanceSq;
395  friend class Intersection;
396  friend class Containment;
397 };
398 
399 #ifndef NLIB_DOXYGEN
400 
// NLIB_M: force-inlined __vectorcall member definition.
// NLIB_M2: plain inline variant of the same signature shape.
#undef NLIB_M
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall
404 
405 // r[i] = v
406 NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
407 #ifdef NLIB_F128_SIMD_NOUSE
408  f128 ret;
409  ret.vec.v[0] = v;
410  ret.vec.v[1] = v;
411  ret.vec.v[2] = v;
412  ret.vec.v[3] = v;
413  return ret;
414 #elif defined(NLIB_SSE41)
415  return _mm_set1_ps(v);
416 #elif defined(NLIB_NEON)
417  return vdupq_n_f32(v);
418 #elif defined(CAFE)
419  f128 ret;
420  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
421  return ret;
422 #endif
423 }
424 
425 // r[i] = *reinterpret_cast<float*>(&v)
426 NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
427 #ifdef NLIB_F128_SIMD_NOUSE
428  f128 ret;
429  ret.vec.u[0] = v;
430  ret.vec.u[1] = v;
431  ret.vec.u[2] = v;
432  ret.vec.u[3] = v;
433  return ret;
434 #elif defined(NLIB_SSE41)
435  union {
436  float f32;
437  uint32_t u32;
438  } tmp;
439  tmp.u32 = v;
440  return _mm_set1_ps(tmp.f32);
441 #elif defined(NLIB_NEON)
442  uint32x4_t tmp = vdupq_n_u32(v);
443  return vreinterpretq_f32_u32(tmp);
444 #elif defined(CAFE)
445  union {
446  float f32;
447  uint32_t u32;
448  } tmp;
449  tmp.u32 = v;
450  f128 ret;
451  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
452  return ret;
453 #endif
454 }
455 
456 // r[0] = a, r[1] = b, r[2] = c, r[3] = d
457 NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
458 #ifdef NLIB_F128_SIMD_NOUSE
459  f128 ret;
460  ret.vec.v[0] = a;
461  ret.vec.v[1] = b;
462  ret.vec.v[2] = c;
463  ret.vec.v[3] = d;
464  return ret;
465 #elif defined(NLIB_SSE41)
466  return _mm_set_ps(d, c, b, a);
467 #elif defined(NLIB_NEON)
468  union {
469  float f32[2];
470  uint64_t u64;
471  } tmp1, tmp2;
472  tmp1.f32[0] = a;
473  tmp1.f32[1] = b;
474  tmp2.f32[0] = c;
475  tmp2.f32[1] = d;
476  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
477 #elif defined(CAFE)
478  f128 ret;
479  ret.vec.ps[0][0] = a;
480  ret.vec.ps[0][1] = b;
481  ret.vec.ps[1][0] = c;
482  ret.vec.ps[1][1] = d;
483  return ret;
484 #endif
485 }
486 
template <size_t N>
// r[i] = value[N]: broadcast lane N to every lane.
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = value.vec.v[N];
    ret.vec.v[1] = value.vec.v[N];
    ret.vec.v[2] = value.vec.v[N];
    ret.vec.v[3] = value.vec.v[N];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
    // Primary template reads the low 64-bit half, so it only serves N = 0, 1;
    // N = 2, 3 are handled by the explicit specializations below.
    float32x2_t tmp = vget_low_f32(value);
    return vdupq_lane_f32(tmp, N);
#elif defined(CAFE)
    // N / 2 selects the paired-single half, N % 2 the slot within it.
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
    return ret;
#endif
}
509 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// NEON: lanes 2 and 3 live in the high 64-bit half, which the primary
// template (vget_low_f32) cannot reach — specialize for them.
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE: __PS_MERGE00/11 duplicate slot 0 / slot 1 of a paired-single half,
// which is cheaper than the generic __PS_FDUP-on-element form.
template <>
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
#endif
547 
548 // r[i] = 0.f
549 NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
550 #ifdef NLIB_F128_SIMD_NOUSE
551  f128 ret;
552  ret.vec.v[0] = 0;
553  ret.vec.v[1] = 0;
554  ret.vec.v[2] = 0;
555  ret.vec.v[3] = 0;
556  return ret;
557 #elif defined(NLIB_SSE41)
558  return _mm_setzero_ps();
559 #elif defined(NLIB_NEON)
560  return vdupq_n_f32(0);
561 #elif defined(CAFE)
562  f128 ret;
563  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
564  return ret;
565 #endif
566 }
567 
// r = { 1.f, 0.f, 0.f, 0.f }
NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 1.f;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // 0x3F800000 is the bit pattern of 1.0f; placed in the low 32 bits,
    // x10 = { 1.f, 0.f }. Combine with a zero half to get {1,0,0,0}.
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x10, x00);
#else
    return F128::LoadA16(F128::v1000_);
#endif
}
584 
// r = { 0.f, 1.f, 0.f, 0.f }
NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 1.f;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // 1.0f bit pattern in the upper 32 bits: x01 = { 0.f, 1.f }.
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x01, x00);
#else
    return F128::LoadA16(F128::v0100_);
#endif
}
601 
// r = { 0.f, 0.f, 1.f, 0.f }
NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 1.f;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // Same { 1.f, 0.f } half as Set1000, but placed in the high half.
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x10);
#else
    return F128::LoadA16(F128::v0010_);
#endif
}
618 
// r = { 0.f, 0.f, 0.f, 1.f }
NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 1.f;
    return ret;
#elif defined(NLIB_NEON)
    // { 0.f, 1.f } half placed in the high half.
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x01);
#else
    return F128::LoadA16(F128::v0001_);
#endif
}
635 
template <size_t N>
// r = value, except r[N] = 0.f.
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[N] = 0.f;
    return ret;
#elif defined(NLIB_SSE41)
    // insertps imm8 bits [3:0] are the zero mask: 1 << N clears lane N.
    return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
    // Permute indices 4..7 pick lanes of the second (all-zero) operand,
    // so index N is replaced by zero and the rest pass through.
    return F128::Permute<N == 0 ? 4 : 0,
                         N == 1 ? 5 : 1,
                         N == 2 ? 6 : 2,
                         N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
    // return vsetq_lane_f32(0.f, value, N);
#elif defined(CAFE)
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = 0.f;
    return ret;
#endif
}
658 
// r[i] = 1.f
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
    return F128::SetValue(1.f, each_float);
}

// r[i] = -1.f
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
    return F128::SetValue(-1.f, each_float);
}

// r[i] = 1.0e-7f (epsilon used by the CmpNearEq family)
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
    return F128::SetValue(1.0e-7f, each_float);
}

// r[i] = 0x7F800000U (bit pattern of +infinity)
NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
    return F128::SetValue(0x7F800000U, each_uint32);
}

// r[i] = 0x7FC00000U (bit pattern of a quiet NaN)
NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
    return F128::SetValue(0x7FC00000U, each_uint32);
}

// r[i] = -0.f (0x80000000U: sign bit only, usable as a sign mask)
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
    return F128::SetValue(-0.f, each_float);
}
688 
689 // r[i] = p[i], p is 16 bytes aligned
690 NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
691 #ifdef NLIB_F128_SIMD_NOUSE
692  f128 ret;
693  ret.vec.v[0] = p[0];
694  ret.vec.v[1] = p[1];
695  ret.vec.v[2] = p[2];
696  ret.vec.v[3] = p[3];
697  return ret;
698 #elif defined(NLIB_SSE41)
699  return _mm_load_ps(p);
700 #elif defined(NLIB_NEON)
701  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
702  uint64x2_t val = vld1q_u64(tmp);
703  return vreinterpretq_f32_u64(val);
704 #elif defined(CAFE)
705  f128 ret;
706  ret.vec.ps[0][0] = p[0];
707  ret.vec.ps[0][1] = p[1];
708  ret.vec.ps[1][0] = p[2];
709  ret.vec.ps[1][1] = p[3];
710  return ret;
711 #endif
712 }
713 
714 // r[i] = p[i], p is 4 bytes aligned
715 NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
716 #ifdef NLIB_F128_SIMD_NOUSE
717  return LoadA16(p);
718 #elif defined(NLIB_SSE41)
719  return _mm_loadu_ps(p);
720 #elif defined(NLIB_NEON)
721  return vld1q_f32(p);
722 #elif defined(CAFE)
723  f128 ret;
724  ret.vec.ps[0][0] = p[0];
725  ret.vec.ps[0][1] = p[1];
726  ret.vec.ps[1][0] = p[2];
727  ret.vec.ps[1][1] = p[3];
728  return ret;
729 #endif
730 }
731 
// r[i] = p[i]; p must be 8-byte aligned.
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // The u64 element type conveys the 8-byte alignment to the compiler.
    const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
    uint64x2_t val = vld1q_u64(tmp);
    return vreinterpretq_f32_u64(val);
#else
    // Other targets cannot exploit 8-byte alignment; fall back to the
    // 4-byte-aligned path.
    return LoadA4(p);
#endif
}
742 
// Integer-address overloads: reinterpret the address and forward to the
// corresponding pointer overload; alignment contracts are unchanged.

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}
772 
773 // p[i] = value[i], p is 16 bytes aligned
774 NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
775 #ifdef NLIB_F128_SIMD_NOUSE
776  p[0] = value.vec.v[0];
777  p[1] = value.vec.v[1];
778  p[2] = value.vec.v[2];
779  p[3] = value.vec.v[3];
780 #elif defined(NLIB_SSE41)
781  _mm_store_ps(p, value);
782 #elif defined(NLIB_NEON)
783  uint64x2_t tmp = vreinterpretq_u64_f32(value);
784  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
785 #elif defined(CAFE)
786  p[0] = value.vec.ps[0][0];
787  p[1] = value.vec.ps[0][1];
788  p[2] = value.vec.ps[1][0];
789  p[3] = value.vec.ps[1][1];
790 #endif
791 }
792 
// p[i] = value[i]; p need only be 4-byte aligned (unaligned-tolerant path).
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    StoreA16(p, value);  // scalar path has no alignment requirement
#elif defined(NLIB_SSE41)
    _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
    vst1q_f32(p, value);
#elif defined(CAFE)
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
    p[2] = value.vec.ps[1][0];
    p[3] = value.vec.ps[1][1];
#endif
}
808 
// p[i] = value[i]; p must be 8-byte aligned.
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // u64 element type conveys the 8-byte alignment to the compiler.
    uint64x2_t tmp = vreinterpretq_u64_f32(value);
    vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
    // Other targets cannot exploit 8-byte alignment; use the 4-byte path.
    StoreA4(p, value);
#endif
}
818 
// Integer-address overloads: reinterpret the address and forward to the
// corresponding pointer overload; alignment contracts are unchanged.

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}
848 
849 // p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
850 NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
851 #ifdef NLIB_F128_SIMD_NOUSE
852  p[0] = value.vec.v[0];
853  p[1] = value.vec.v[1];
854 #elif defined(NLIB_SSE41)
855  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
856 #elif defined(NLIB_NEON)
857  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
858  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
859 #elif defined(CAFE)
860  p[0] = value.vec.ps[0][0];
861  p[1] = value.vec.ps[0][1];
862 #endif
863 }
864 
865 // p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
866 NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
867 #ifdef NLIB_F128_SIMD_NOUSE
868  p[0] = value.vec.v[0];
869  p[1] = value.vec.v[1];
870 #elif defined(NLIB_SSE41)
871  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
872 #elif defined(NLIB_NEON)
873  float32x2_t tmp = vget_low_f32(value);
874  vst1_f32(p, tmp);
875 #elif defined(CAFE)
876  p[0] = value.vec.ps[0][0];
877  p[1] = value.vec.ps[0][1];
878 #endif
879 }
880 
// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned (integer-address overload)
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}
885 
// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned (integer-address overload)
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}
890 
// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned (signed integer-address overload)
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}
895 
// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned (signed integer-address overload)
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}
900 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
// Stores only the high two lanes; the low two lanes of value are ignored.
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    vst1_f32(p, vget_high_f32(value));
#elif defined(CAFE)
    // Paired-single layout: lanes 2-3 live in ps[1].
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}
915 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
// Identical lane selection to StoreHiA8; only the alignment contract differs.
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    float32x2_t tmp = vget_high_f32(value);
    vst1_f32(p, tmp);
#elif defined(CAFE)
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}
931 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned (integer-address overload)
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA8(reinterpret_cast<float*>(p), value);
}
936 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned (integer-address overload)
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA4(reinterpret_cast<float*>(p), value);
}
941 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned (signed integer-address overload)
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA8(reinterpret_cast<float*>(p), value);
}
946 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned (signed integer-address overload)
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA4(reinterpret_cast<float*>(p), value);
}
951 
// r[i] = fabs(value[i])
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback uses (v > 0 ? v : -v), so for -0.0f/NaN the bit pattern
    // may differ from the sign-bit-clear backends below (values still compare
    // equal for all non-NaN inputs).
    f128 ret;
    ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
    ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
    ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
    ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vabsq_f32(value);
#elif defined(NLIB_SSE41)
    // Clear the sign bit of every lane.
    const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
    return _mm_andnot_ps(signmask, value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
    return ret;
#endif
}
973 
// r[i] = mask[i] ? a[i] : b[i]
// mask is expected to be a canonical lane mask (all-ones or all-zero per lane,
// as produced by the Cmp* functions): the scalar path blends bitwise,
// SSE4.1 blendv keys on the lane's sign bit only, and the CAFE path keys on
// the sign of the mask treated as a float - they only agree for canonical masks.
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 result;
    result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
    result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
    result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
    result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
    return result;
#elif defined(NLIB_SSE41)
    return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
    return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(CAFE)
    // avoid NaN: clear bits so an all-ones mask is not treated as NaN by __PS_SEL
    f128 mask_ = mask;
    mask_.vec.u[0] &= 0xFF7FFFFFUL;
    mask_.vec.u[1] &= 0xFF7FFFFFUL;
    mask_.vec.u[2] &= 0xFF7FFFFFUL;
    mask_.vec.u[3] &= 0xFF7FFFFFUL;
    // mask_ < 0 ? a : b
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
1001 
1002 /*
1003 NLIB_M(f128) F128::ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT {
1004 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1005  float16x4_t lo = vget_low_f16(vreinterpretq_f16_f32(value));
1006  return vcvt_f32_f16(lo);
1007 #else
1008  // _mm_cvtph_ps
1009  (void)value;
1010  return F128::SetZero();
1011 #endif
1012 }
1013 
1014 NLIB_M(f128) F128::ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT {
1015 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1016 #ifdef __aarch64__
1017  return vcvt_high_f32_f16(value);
1018 #else
1019  float16x4_t hi = vget_high_f16(vreinterpretq_f16_f32(value));
1020  return vcvt_f32_f16(hi);
1021 #endif
1022 #else
1023  // _mm_cvtph_ps
1024  (void)value;
1025  return F128::SetZero();
1026 #endif
1027 }
1028 
1029 NLIB_M(f128) F128::ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT {
1030 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1031 #ifdef __aarch64__
1032  float16x4_t lo = vcvt_f16_f32(a);
1033  float16x8_t x = vcvt_high_f16_f32(lo, b);
1034  return vreinterpretq_f32_f16(x);
1035 #else
1036  float16x4_t lo = vcvt_f16_f32(a);
1037  float16x4_t hi = vcvt_f16_f32(b);
1038  return vreinterpretq_f32_f16(vcombine_f16(lo, hi));
1039 #endif
1040 #else
1041  // _mm_cvtps_ph
1042  (void)a;
1043  (void)b;
1044  return F128::SetZero();
1045 #endif
1046 }
1047 */
1048 
1049 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// r[i] = static_cast<float>(value[i]) - numeric int32 -> float conversion per lane.
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
    // i128 is an int8x16_t; reinterpret to int32 lanes first.
    return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}
1058 
// r[i] = *reinterpret_cast<float*>(&value[i]) - pure bit reinterpretation, no conversion.
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_f32_s8(value);
#endif
}
1067 
// r[i] = static_cast<int>(roundf(value[i]))
// NOTE(review): the NEON path adds +/-0.5 (matching the sign of value) and
// truncates, i.e. rounds halves away from zero, while _mm_cvtps_epi32 uses the
// current x86 rounding mode (round-to-nearest-even by default) - results can
// differ for exact .5 inputs; confirm whether callers depend on this.
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
    uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
    uint32x4_t sgn = vdupq_n_u32(0x80000000U);
    // Copy the sign of each lane onto 0.5 so the bias moves away from zero.
    uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
    w = vorrq_u32(w, half);
    return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}
1080 
// r[i] = static_cast<int>(value[i]) - conversion with truncation toward zero.
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}
1088 
// r[i] = *reinterpret_cast<int*>(&value[i]) - pure bit reinterpretation, no conversion.
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_f32(value);
#endif
}
1097 
template<int N>
// r[i] = value[i] * 2^-N : converts Q(32-N).N fixed point lanes to float.
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
    return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
    f128 f = F128::ConvertFromI128(value);
    // Build 2^-N directly from its IEEE-754 bit pattern (exponent 0x7F - N).
    f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32); // 0.5f^N
    return F128::Mult(f, m);
#endif
}
1109 
template<int N>
// r[i] = static_cast<int>(value[i] * 2^N) : converts float lanes to Q(32-N).N fixed point.
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
    // Build 2^N directly from its IEEE-754 bit pattern (exponent 0x7F + N).
    f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32); // 2.f^N
    f128 f = F128::Mult(value, m);
    return F128::ConvertToI128Truncate(f);
#endif
}
1121 
1122 #endif
1123 
// r[i] = (a[i] < b[i]) ? 0xFFFFFFFF : 0 - full-lane mask result, suitable for Select/And.
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcltq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1140 
// r[i] = (a[i] <= b[i]) ? 0xFFFFFFFF : 0 - full-lane mask result, suitable for Select/And.
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcleq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1157 
// r[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0 - full-lane mask result, suitable for Select/And.
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgtq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1174 
// r[i] = (a[i] >= b[i]) ? 0xFFFFFFFF : 0 - full-lane mask result, suitable for Select/And.
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgeq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1191 
// r[i] = (a[i] != b[i]) ? 0xFFFFFFFF : 0 - full-lane mask result; NaN lanes compare unequal.
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
    // NEON has no not-equal compare; invert the equality mask.
    uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1208 
// r[i] = a[i] + b[i] - lanewise single-precision addition.
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
    return vaddq_f32(a, b);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1229 
// r[i] = a[i] - b[i] - lanewise single-precision subtraction.
NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
    return vsubq_f32(a, b);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1250 
// r[i] = -value[i] - lanewise sign flip.
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = -value.vec.v[0];
    ret.vec.v[1] = -value.vec.v[1];
    ret.vec.v[2] = -value.vec.v[2];
    ret.vec.v[3] = -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vnegq_f32(value);
#elif defined(NLIB_SSE41)
    // Flip the sign bit of every lane.
    const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
    return _mm_xor_ps(signmask, value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
    return ret;
#endif
}
1272 
// r[i] = a[i] * b[i] - lanewise single-precision multiplication.
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
    return vmulq_f32(a, b);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1293 
// r[i] = a * b[i] - scalar-by-vector multiplication.
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON has a dedicated multiply-by-scalar form.
    return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
    ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
    return ret;
#else
    // Other backends: splat the scalar and do a lanewise multiply.
    return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}
1307 
1308 template <size_t N>
1309 // r[i] = a[N] * b[i]
1310 NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
1311 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1312 # if __aarch64__
1313  return vmulq_laneq_f32(b, a, N);
1314 # else
1315  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
1316  return vmulq_n_f32(b, tmp);
1317 # endif
1318 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
1319  float t = a.vec.ps[N / 2][N % 2];
1320  f128 ret;
1321  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
1322  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
1323  return ret;
1324 #else
1325  return F128::Mult(F128::SetValue<N>(a, each_select32), b);
1326 #endif
1327 }
1328 
// r[i] = a[i] / b[i] - lanewise division.
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vdivq_f32(a, b);
# else
    // AArch32 has no divide: refine the reciprocal estimate with two
    // Newton-Raphson steps, then multiply. Result is approximate, not
    // IEEE-correctly-rounded.
    float32x4_t inv0 = vrecpeq_f32(b);
    float32x4_t step0 = vrecpsq_f32(inv0, b);
    float32x4_t inv1 = vmulq_f32(step0, inv0);
    float32x4_t step1 = vrecpsq_f32(inv1, b);
    float32x4_t inv2 = vmulq_f32(step1, inv1);
    // Zero divisors would yield garbage from the estimate; force 1/b to
    // infinity so a/0 behaves like the hardware-divide backends.
    uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
    inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
    return vmulq_f32(a, inv2);
# endif
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1360 
// r[i] = max(a[i], b[i])
// NOTE(review): NaN handling differs per backend (_mm_max_ps returns the
// second operand when either input is NaN; the CAFE path keys on the sign of
// a-b) - callers should not rely on a particular NaN result.
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
    return vmaxq_f32(a, b);
#elif defined(CAFE)
    // Select by the sign of (a - b): non-negative picks a, negative picks b.
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
1383 
1384 // r[i] = min(a[i], b[i])
1385 NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
1386 #ifdef NLIB_F128_SIMD_NOUSE
1387  f128 ret;
1388  ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
1389  ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
1390  ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
1391  ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
1392  return ret;
1393 #elif defined(NLIB_SSE41)
1394  return _mm_min_ps(a, b);
1395 #elif defined(NLIB_NEON)
1396  return vminq_f32(a, b);
1397 #elif defined(CAFE)
1398  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
1399  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
1400  f128 ret;
1401  ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
1402  ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
1403  return ret;
1404 #endif
1405 }
1406 
// r[0] = max(a[0], a[1]), r[1] = max(a[2], a[3]), r[2] = max(b[0], b[1]), r[3] = max(b[2], b[3])
// Horizontal (pairwise) maximum across adjacent lanes of a, then b.
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Max each lane against its swapped neighbor, then gather the even lanes.
    f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpmaxq_f32(a, b);
# else
    float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
    // Transpose pairs into (even lanes, odd lanes), then sign-select the larger.
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
    return ret;
#endif
}
1442 
// r[0] = min(a[0], a[1]), r[1] = min(a[2], a[3]), r[2] = min(b[0], b[1]), r[3] = min(b[2], b[3])
// Horizontal (pairwise) minimum across adjacent lanes of a, then b.
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    // Min each lane against its swapped neighbor, then gather the even lanes.
    f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpminq_f32(a, b);
# else
    float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
# endif
#elif defined(CAFE)
    // Transpose pairs into (even lanes, odd lanes), then sign-select the smaller.
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
    return ret;
#endif
}
1478 
// r[0] = a[0] + a[1], r[1] = a[2] + a[3], r[2] = b[0] + b[1], r[3] = b[2] + b[3]
// Horizontal (pairwise) addition across adjacent lanes of a, then b.
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
# ifdef __aarch64__
    return vpaddq_f32(a, b);
# else
    float32x2_t al = vget_low_f32(a);
    float32x2_t ah = vget_high_f32(a);
    float32x2_t l = vpadd_f32(al, ah);

    float32x2_t bl = vget_low_f32(b);
    float32x2_t bh = vget_high_f32(b);
    float32x2_t h = vpadd_f32(bl, bh);
    return vcombine_f32(l, h);
# endif
#elif defined(CAFE)
    // Transpose pairs into (even lanes, odd lanes), then add them.
    f32x2 v02, v13, cmp;
    f128 ret;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    ret.vec.ps[0] = __PS_ADD(v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    ret.vec.ps[1] = __PS_ADD(v02, v13);
    return ret;
#endif
}
1515 
// r[i] = fabs(a[i] - b[i]) - absolute lanewise difference.
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON provides this as a single instruction.
    return vabdq_f32(a, b);
#else
    return F128::Abs(F128::Sub(a, b));
#endif
}
1524 
// r[i] = c[i] + a[i] * b[i]
// AArch64 uses a fused multiply-add (single rounding); AArch32 vmlaq_f32 and
// the generic Add(Mult(..)) fallback round the product separately, so results
// may differ in the last bit between platforms.
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    return vfmaq_f32(c, a, b);
# else
    return vmlaq_f32(c, a, b);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    return F128::Add(c, F128::Mult(a, b));
#endif
}
1542 
// r[i] = c[i] + a * b[i] - scalar multiplier variant of MultAdd.
NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    return vfmaq_n_f32(c, b, a);
# else
    return vmlaq_n_f32(c, b, a);
# endif
#else
    // Splat the scalar and reuse the vector form.
    return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}
1555 
1556 template <size_t N>
1557 // r[i] = c[i] + a[N] * b[i]
1558 NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
1560  NLIB_STATIC_ASSERT(N < 4);
1561 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1562 # if __aarch64__
1563  return vfmaq_laneq_f32(c, b, a, N);
1564 # else
1565  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1566 # endif
1567 #else
1568  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
1569 #endif
1570 }
1571 
// r[i] = c[i] - a[i] * b[i]
// AArch64 uses a fused multiply-subtract (single rounding); other paths round
// the product separately, so results may differ in the last bit.
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    return vfmsq_f32(c, a, b);
# else
    return vmlsq_f32(c, a, b);
# endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    return F128::Sub(c, F128::Mult(a, b));
#endif
}
1589 
// r[i] = c[i] - a * b[i] - scalar multiplier variant of MultSub.
NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
# if __aarch64__
    return vfmsq_n_f32(c, b, a);
# else
    return vmlsq_n_f32(c, b, a);
# endif
#else
    // Splat the scalar and reuse the vector form.
    return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}
1602 
1603 template <size_t N>
1604 // r[i] = c[i] - a[N] * b[i]
1605 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
1607  NLIB_STATIC_ASSERT(N < 4);
1608 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1609 # ifdef __arch64__
1610  return vfmsq_laneq_f32(c, b, a, N);
1611 # else
1612  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1613 # endif
1614 #else
1615  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
1616 #endif
1617 }
1618 
1619 // r[i] = a[i] + t[i] * (b[i] - a[i])
1620 NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
1621  // a + t * (b - a)
1622  return F128::MultAdd(t, F128::Sub(b, a), a);
1623 }
1624 
// r[i] = a[i] & b[i] - bitwise AND of the raw lane bits (used for mask logic).
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1641 
// -pi <= angle1, angle2, r < pi
// Adds two angles and wraps the sum back into [-pi, pi) by conditionally
// adding/subtracting 2*pi once (so angle2 may range over [-2pi, 2pi]).
// Lanes of pi_values_ are used as: [0] = +pi, [1] = -pi, [2] = 2*pi
// (inferred from the comparisons below - confirm against the table definition).
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    f128 sum = F128::Add(angle1, angle2);
    // If sum < -pi, add 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    // If sum >= +pi, subtract 2*pi.
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}
1657 
// -pi <= angle1, angle2, r < pi
// Subtracts two angles and wraps the result back into [-pi, pi); same
// single-step wrap and pi_values_ layout as AddAngle.
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
    // -pi <= ret < pi
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);

    f128 sum = F128::Sub(angle1, angle2);
    // If the difference < -pi, add 2*pi.
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    // If the difference >= +pi, subtract 2*pi.
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}
1673 
// ( 2 1 -2 1) (p0)
// (t^3 t^2 t 1) (-3 -2 3 -1) (v0)
// ( 0 1 0 0) (p1)
// ( 1 0 0 0) (v1)
// Cubic Hermite interpolation of position p0->p1 with tangents v0, v1 at
// parameter t (t expected splatted across all lanes).
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
    // (2 * p0 + v0 - 2 * p1 + v1) * t^3 + (-3 * p0 - 2 * v0 + 3 * p1 - v1) * t^2
    // + v0 * t + p0
    // ==
    // (2 * t^3 - 3 * t^2 + 1) * p0 + (t^3 - 2 * t^2 + t) * v0
    // + (-2 * t^3 + 3 * t^2) * p1 + (t^3 - t^2) * v1
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // Bit patterns decode to the matrix rows: R0 = {2, 1, -2, 1} (t^3 row)
    // and R1 = {-3, -2, 3, -1} (t^2 row).
    f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),
                                   vcreate_f32(0x3F800000C0000000ULL));
    f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),
                                   vcreate_f32(0xBF80000040400000ULL));
#else
    f128 hermite_R0 = F128::LoadA16(hermite_R0_);
    f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif

    // Accumulate the basis weights: ttt becomes (t^3 t^2 t 1) * M,
    // with Set0100 = {0,1,0,0} as the t row and Set1000 = {1,0,0,0} as the 1 row.
    ttt = F128::Mult(ttt, hermite_R0);
    ttt = F128::MultAdd(tt, hermite_R1, ttt);
    ttt = F128::MultAdd(t, F128::Set0100(), ttt);
    ttt = F128::Add(ttt, F128::Set1000());

    // vec(ttt) * mtx(p0, v0, p1, v1)
    f128 result = F128::Mult<0>(ttt, p0, each_select32);
    result = F128::MultAdd<1>(ttt, v0, result, each_select32);
    result = F128::MultAdd<2>(ttt, p1, result, each_select32);
    result = F128::MultAdd<3>(ttt, v1, result, each_select32);
    return result;
}
1709 
// (-1 3 -3 1) (p0)
// 0.5 * (t^3 t^2 t 1) ( 2 -5 4 -1) (p1)
// (-1 0 1 0) (p2)
// ( 0 2 0 0) (p3)
// Catmull-Rom spline through p1..p2 using p0, p3 as outer control points.
// NOTE(review): the NEON constants decode to the UNSCALED rows
// R0 = {-1,3,-3,1}, R1 = {2,-5,4,-1}, R2 = {-1,0,1,0}, with only the constant
// row halved (Set0100 = {0,1,0,0} vs matrix row {0,2,0,0}); evaluated at t = 1
// these weights give (0,-1,2,0) rather than the expected (0,0,1,0). Verify
// against the catmull_R0_/R1_/R2_ tables whether a 0.5 scale is missing here.
NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 catmull_R0 = vcombine_f32(vcreate_f32(0x40400000BF800000ULL),
                                   vcreate_f32(0x3F800000C0400000ULL));
    f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC0A0000040000000ULL),
                                   vcreate_f32(0xBF80000040800000ULL));
    f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF800000ULL),
                                   vcreate_f32(0x000000003F800000ULL));
#else
    f128 catmull_R0 = F128::LoadA16(catmull_R0_);
    f128 catmull_R1 = F128::LoadA16(catmull_R1_);
    f128 catmull_R2 = F128::LoadA16(catmull_R2_);
#endif
    // Accumulate the basis weights: ttt becomes (t^3 t^2 t 1) * M.
    ttt = F128::Mult(ttt, catmull_R0);
    ttt = F128::MultAdd(tt, catmull_R1, ttt);
    ttt = F128::MultAdd(t, catmull_R2, ttt);
    ttt = F128::Add(ttt, F128::Set0100());

    // vec(ttt) * mtx(p0, p1, p2, p3)
    f128 result = F128::Mult<0>(ttt, p0, each_select32);
    result = F128::MultAdd<1>(ttt, p1, result, each_select32);
    result = F128::MultAdd<2>(ttt, p2, result, each_select32);
    result = F128::MultAdd<3>(ttt, p3, result, each_select32);

    return result;
}
1743 
1744 // p0 + f * (p1 - p0) + g * (p2 - p0)
1745 NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
1746  f128arg_ex g) NLIB_NOEXCEPT {
1747  f128 p1p0 = F128::Sub(p1, p0);
1748  f128 p2p0 = F128::Sub(p2, p0);
1749  f128 tmp = F128::MultAdd(f, p1p0, p0);
1750  return F128::MultAdd(g, p2p0, tmp);
1751 }
1752 
// r[i] = a[i] | b[i] - bitwise OR of the raw lane bits (used for mask logic).
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
1769 
1770 // r[i] = a[i] ^ b[i]
1771 NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
1772 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1773  f128 ret;
1774  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
1775  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
1776  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
1777  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
1778  return ret;
1779 #elif defined(NLIB_SSE41)
1780  return _mm_xor_ps(a, b);
1781 #elif defined(NLIB_NEON)
1782  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
1783  return vreinterpretq_f32_u32(tmp);
1784 #endif
1785 }
1786 
1787 // r[i] = ~a[i]
1788 NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
1789 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1790  f128 ret;
1791  ret.vec.u[0] = ~a.vec.u[0];
1792  ret.vec.u[1] = ~a.vec.u[1];
1793  ret.vec.u[2] = ~a.vec.u[2];
1794  ret.vec.u[3] = ~a.vec.u[3];
1795  return ret;
1796 #elif defined(NLIB_SSE41)
1797  return _mm_andnot_ps(a, F128::CmpEq(a, a));
1798 #elif defined(NLIB_NEON)
1799  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
1800  return vreinterpretq_f32_u32(tmp);
1801 #endif
1802 }
1803 
1804 // r[i] = ~a[i] & b[i]
1805 NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1806 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1807  f128 ret;
1808  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
1809  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
1810  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
1811  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
1812  return ret;
1813 #elif defined(NLIB_SSE41)
1814  return _mm_andnot_ps(a, b);
1815 #elif defined(NLIB_NEON)
1816  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1817  return vreinterpretq_f32_u32(tmp);
1818 #endif
1819 }
1820 
1821 // r[i] = ~a[i] | b[i]
1822 NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1823 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1824  f128 ret;
1825  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
1826  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
1827  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
1828  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
1829  return ret;
1830 #elif defined(NLIB_SSE41)
1831  return _mm_or_ps(F128::Not(a), b);
1832 #elif defined(NLIB_NEON)
1833  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1834  return vreinterpretq_f32_u32(tmp);
1835 #endif
1836 }
1837 
1838 // r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
// r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
// Standard IEEE equality: NaN lanes compare unequal and yield 0.
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vceqq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}
1854 
1855 // r[i] = (absf(a[i] - b[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1856 NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
1857  f128 tmp = F128::AbsDiff(a, b);
1858  return F128::CmpLe(tmp, eps);
1859 }
1860 
1861 // r[i] = clamp(value[i], min[i], max[i])
1862 NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
1863  return F128::Min(max, F128::Max(min, value));
1864 }
1865 
1866 // r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
// r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // vcaleq performs the absolute-value compare in a single instruction.
    uint32x4_t tmp = vcaleq_f32(value, bounds);
    return vreinterpretq_f32_u32(tmp);
#else
    return F128::CmpLe(F128::Abs(value), bounds);
#endif
}
1875 
// r[i] = (value[i] == 0) ? 0xFFFFFFFF : 0
// AArch64 provides dedicated compare-against-zero instructions; other
// backends compare against an explicit zero vector.
NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
    return F128::CmpEq(value, F128::SetZero());
#endif
}

// r[i] = (value[i] < 0) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
    return F128::CmpLt(value, F128::SetZero());
#endif
}

// r[i] = (value[i] <= 0) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vclezq_f32(value));
#else
    return F128::CmpLe(value, F128::SetZero());
#endif
}

// r[i] = (value[i] > 0) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
    return F128::CmpGt(value, F128::SetZero());
#endif
}

// r[i] = (value[i] >= 0) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
    return F128::CmpGe(value, F128::SetZero());
#endif
}

// r[i] = (value[i] != 0) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    // No vcnezq; invert the equal-to-zero mask instead.
    return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
    return F128::CmpNe(value, F128::SetZero());
#endif
}
1923 
1924 // r[i] = (absf(value[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1925 NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
1926  f128 tmp = F128::Abs(value);
1927  return F128::CmpLe(tmp, eps);
1928 }
1929 
1930 // r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
// r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    // AArch64 has a true vector divide.
    return vdivq_f32(vdupq_n_f32(1.f), value);
#else
    // Reciprocal estimate refined by two Newton-Raphson steps.
    float32x4_t x;
    x = vrecpeq_f32(value);
    x = vmulq_f32(x, vrecpsq_f32(x, value));  // x1 = x0 * (2 - x0 * value)
    x = vmulq_f32(x, vrecpsq_f32(x, value));  // x2 = x1 * (2 - x1 * value)
    // Zero lanes would otherwise produce garbage from the estimate; force +inf.
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#endif
#elif defined(CAFE)
    return F128::Div(F128::SetOne(), value);
#endif
}

// r[i] = 1.f / value[i] with lower precision (hardware estimate where available)
NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
    return vrecpeq_f32(value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
    return ret;
#endif
}

// r[i] = sqrtf(value[i]) with higher precision
NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) = v * (1/sqrt(v)); RecpSqrt(0) is +inf, and 0 * inf = NaN,
    // so zero lanes are masked back to 0 afterwards.
    f128 iszero = F128::CmpEqZero(value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::AndNot(iszero, result);
#elif defined(CAFE)
    // Same v * rsqrt(v) trick; Select (not AndNot) because CAFE masks are
    // sign-bit encoded rather than full 32-bit masks.
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#endif
}

// r[i] = sqrtf(value[i]) with lower precision
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) approximated as 1 / rsqrt_estimate(v), both estimates.
    return vrecpeq_f32(vrsqrteq_f32(value));
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
    return ret;
#endif
}
2022 
2023 // r[i] = sqrtf(1.f / value[i]) with higher precision
// r[i] = 1.f / sqrtf(value[i]) with higher precision, +inf if value[i] == 0
NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
    // rsqrt estimate refined by two Newton-Raphson steps:
    // x' = x * (3 - v * x^2) / 2, implemented by vrsqrtsq.
    float32x4_t x;
    x = vrsqrteq_f32(value);
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    // Force +inf on zero lanes to match the other backends.
    f128 zeromask = F128::CmpEqZero(value);
    return F128::Select(zeromask, F128::SetInfinity(), x);
#elif defined(CAFE)
    // Paired-single rsqrt estimate with two explicit Newton-Raphson steps
    // per half: x' = x * (3 - v * x^2) * 0.5
    f32x2 three = __PS_FDUP(3.f);
    f32x2 half = __PS_FDUP(0.5f);
    f32x2 x;
    f32x2 xx;
    f32x2 v;
    f128 ret;

    v = value.vec.ps[0];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[0] = __PS_MUL(half, xx);

    v = value.vec.ps[1];
    x = __PS_RSQRTE(v);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[1] = __PS_MUL(half, xx);

    f128 iszero = F128::CmpEq(F128::SetZero(), value);
    f128 inf = F128::SetInfinity();
    return F128::Select(iszero, inf, ret);
#endif
}

// r[i] = 1.f / sqrtf(value[i]) with lower precision (hardware estimate)
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
    return vrsqrteq_f32(value);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
    return ret;
#endif
}
2101 
// Compile-time per-lane negation: result lane i is -value[i] when
// NegateLaneI is true, value[i] otherwise.
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
    // Permute indices 0-3 pick from value, 4-7 pick from Negate(value).
    const size_t lane0 = NegateLane0 ? 4 : 0;
    const size_t lane1 = NegateLane1 ? 5 : 1;
    const size_t lane2 = NegateLane2 ? 6 : 2;
    const size_t lane3 = NegateLane3 ? 7 : 3;
    return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}

// All-false specialization: no lane negated, skip the permute entirely.
template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
    return value;
}

// All-true specialization: every lane negated, a plain Negate suffices.
template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
    return F128::Negate(value);
}
2120 
2121 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2122 #define NLIB_ISNAN(vec, idx) \
2123  ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
2124 #define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
2125 #endif
2126 
2127 // r[i] = isnan(value[i]) ? 0xFFFFFFFF : 0
2128 NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
2129 #if defined(NLIB_F128_SIMD_NOUSE)
2130  f128 ret;
2131  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
2132  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
2133  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
2134  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
2135  return ret;
2136 #elif defined(CAFE)
2137  // on CAFE, value is NaN if value < 0 && -value < 0
2138  f32x2 one = __PS_FDUP(1.f);
2139  f32x2 minus_one = __PS_NEG(one);
2140  f32x2 v0 = value.vec.ps[0];
2141  f32x2 v1 = value.vec.ps[1];
2142  f32x2 t0 = __PS_SEL(v0, one, minus_one);
2143  f32x2 t1 = __PS_SEL(v1, one, minus_one);
2144  f128 ret;
2145  f32x2 v0neg = __PS_NEG(v0);
2146  f32x2 v1neg = __PS_NEG(v1);
2147  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
2148  ret.vec.ps[1] = __PS_SEL(v1neg, one, t0);
2149  return ret;
2150 #else
2151  return F128::CmpNe(value, value);
2152 #endif
2153 }
2154 
2155 // r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
// r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(CAFE)
    // CAFE masks are sign-bit encoded: FLT_MAX - |v| goes negative only when
    // |v| exceeds every finite float, i.e. when v is infinite.
    // NOTE(review): a NaN lane propagates NaN through this subtraction —
    // presumably acceptable for this platform's mask consumers; confirm.
    f128 ret;
    f32x2 big_value = __PS_FDUP(FLT_MAX);
    ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
    return ret;
#else
    // |v| == +inf exactly identifies infinite lanes (NaN fails the equality).
    f128 inf_value = F128::SetInfinity();
    f128 abs_value = F128::Abs(value);
    return F128::CmpEq(inf_value, abs_value);
#endif
}
2176 
2177 // for example, 6.54321 -> 7.0, -6.54321 -> -7.0
// Round to nearest integer, halfway away from zero on the fast paths:
// for example, 6.54321 -> 7.0, -6.54321 -> -7.0
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
    return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
    return vrndaq_f32(value);
#else
    // Add and Sub a big value (2^23, sign-matched to the input) to round the
    // number after the decimal point.
    // NOTE(review): this trick assumes |value| < 2^23; larger magnitudes are
    // already integral but the add/sub can perturb the last bit — assumed
    // out of scope for callers of this path.
    f128 sgn = F128::And(value, F128::SetSignMask());
    f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
    f128 result = F128::Sub(F128::Add(value, sm), sm);
    return result;
#endif
}

// Round toward zero: for example, 6.54321 -> 6.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
// Note that there is no fraction if |value| > 2^23
// 2^23 = 8388608
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    for (size_t i = 0; i < 4; ++i) {
        if (NLIB_ISNAN(value.vec, i)) {
            // Canonical quiet NaN for NaN inputs.
            ret.vec.u[i] = 0x7FC00000U;
        } else {
            // Truncate via int cast when in range; values at or beyond 2^23
            // have no fractional part and pass through unchanged.
            ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                               ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                               : value.vec.v[i];
        }
    }
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // No vector truncate before ARMv8: round-trip through int32 for lanes
    // small enough to have a fraction, keep the original value otherwise.
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    return F128::Select(cond, casted, value);
# else
    return vrndq_f32(value);
# endif
#endif
}
2222 
2223 // for example, 6.54321 -> 6.0, -6.54321 -> -7.0
// Round toward -inf: for example, 6.54321 -> 6.0, -6.54321 -> -7.0
NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = floorf(value.vec.v[0]);
    ret.vec.v[1] = floorf(value.vec.v[1]);
    ret.vec.v[2] = floorf(value.vec.v[2]);
    ret.vec.v[3] = floorf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // Truncation rounds toward zero; subtract 1 where that overshot the
    // floor (negative non-integers): -1 if the truncated result is larger.
    f128 largeMask = F128::CmpGt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
    return F128::Select(cond, casted, value);
# else
    return vrndmq_f32(value);
# endif
#endif
}

// Round toward +inf: for example, 6.54321 -> 7.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = ceilf(value.vec.v[0]);
    ret.vec.v[1] = ceilf(value.vec.v[1]);
    ret.vec.v[2] = ceilf(value.vec.v[2]);
    ret.vec.v[3] = ceilf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
# if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // Truncation rounds toward zero; add 1 where that undershot the
    // ceiling (positive non-integers): +1 if the truncated result is smaller.
    f128 smallMask = F128::CmpLt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f (so Sub adds 1)
    casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
    return F128::Select(cond, casted, value);
# else
    return vrndpq_f32(value);
# endif
#endif
}
2284 
// The helper macros are defined above for both the no-SIMD and CAFE builds
// (see the matching #if), so undef them under the same condition; previously
// only NLIB_F128_SIMD_NOUSE was covered, leaking the macros on CAFE builds.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#undef NLIB_ISNAN
#undef NLIB_ISINF
#endif
2289 
2290 // r[i] = clamp(value[i], { 0, 0, 0, 0 }, { 1, 1, 1, 1 })
2291 NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
2292  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
2293 }
2294 
2295 NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
2296  static const float v_1_2pi = 0.15915494309189535f;
2297  static const float v_2pi = 6.283185307179586f;
2298  // value - 2pi * round(value * (1/2pi)) to be [-pi, pi)
2299  const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
2300  f128 round = F128::Round(F128::Mult(value, recpTwoPi));
2301  const f128 twoPi = F128::SetValue(v_2pi, each_float);
2302  return F128::MultSub(twoPi, round, value);
2303 }
2304 
2305 NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
2306  // within [-pi, pi)
2307  f128 x = F128::ModAngle(value);
2308 
2309  // within [-pi/2, pi/2]
2310  // use sin(x) == sin(pi - x), sin(x) == sin(-pi - x)
2311  // |x| <= pi/2 -> x = x
2312  // x > pi/2 -> x = pi - x
2313  // x < -pi/2 -> x = -pi - x
2314  f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
2315  f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
2316  f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);
2317 
2318  f128 xabs = F128::Abs(value);
2319  f128 xsign = F128::And(F128::SetSignMask(), x);
2320  f128 mypi = F128::Or(xsign, pi);
2321  f128 pi_x = F128::Sub(mypi, x);
2322  f128 cond = F128::CmpLe(xabs, pidiv2);
2323  x = F128::Select(cond, x, pi_x);
2324 
2325  f128 xx = F128::Mult(x, x);
2326  f128 coeff = F128::LoadA16(sin_coeff_);
2327  f128 result;
2328  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2329 
2330  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2331  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2332  result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
2333  result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
2334  result = F128::Mult(xx, result);
2335  result = F128::MultSub(result, x, x);
2336  return result;
2337 }
2338 
2339 NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
2340  // within [-pi, pi)
2341  f128 x = F128::ModAngle(value);
2342 
2343  // within [-pi/2, pi/2]
2344  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2345  // |x| <= pi/2 -> x = x
2346  // x > pi/2 -> x = pi - x
2347  // x < -pi/2 -> x = -pi - x
2348  f128 cvalue = F128::LoadA16(cos_cvalue_);
2349 
2350  f128 xabs = F128::Abs(value);
2351  f128 xsign = F128::And(F128::SetSignMask(), x);
2352  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2353  f128 pi_x = F128::Sub(mypi, x);
2354  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2355  x = F128::Select(cond, x, pi_x);
2356 
2357  // +1 if [-pi/2, pi/2], -1 otherwise
2358  f128 sign = F128::AndNot(cond, F128::SetSignMask());
2359 
2360  // xx = x^2
2361  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2362  f128 xx = F128::Mult(x, x);
2363  f128 coeff = F128::LoadA16(cos_coeff_);
2364  f128 result;
2365  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2366 
2367  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2368  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2369  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2370  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2371  result = F128::MultSub(xx, result, F128::SetOne());
2372  result = F128::Xor(sign, result);
2373  return result;
2374 }
2375 
2376 NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
2377  // within [-pi, pi)
2378  const f128 signmask = F128::SetSignMask();
2379  f128 x = F128::ModAngle(value);
2380 
2381  // within [-pi/2, pi/2]
2382  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2383  // |x| <= pi/2 -> x = x
2384  // x > pi/2 -> x = pi - x
2385  // x < -pi/2 -> x = -pi - x
2386  f128 cvalue = F128::LoadA16(cos_cvalue_);
2387 
2388  f128 xabs = F128::Abs(value);
2389  f128 xsign = F128::And(signmask, x);
2390  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2391  f128 pi_x = F128::Sub(mypi, x);
2392  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2393  x = F128::Select(cond, x, pi_x);
2394 
2395  // +1 if [-pi/2, pi/2], -1 otherwise
2396  f128 sign = F128::AndNot(cond, signmask);
2397 
2398  // xx = x^2
2399  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2400  f128 xx = F128::Mult(x, x);
2401  f128x2 ret;
2402 
2403  // cos
2404  {
2405  f128 coeff = F128::LoadA16(cos_coeff_);
2406  f128 result;
2407  result =
2408  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2409 
2410  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2411  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2412  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2413  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2414  result = F128::MultSub(xx, result, F128::SetOne());
2415 
2416  ret.val[1] = F128::Xor(sign, result); // cos
2417  }
2418 
2419  // sin
2420  {
2421  f128 coeff = F128::LoadA16(sin_coeff_);
2422  f128 result;
2423  result =
2424  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2425 
2426  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2427  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2428  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
2429  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
2430  result = F128::Mult(xx, result);
2431  ret.val[0] = F128::MultSub(result, x, x); // sin
2432  }
2433  return ret;
2434 }
2435 
// Per-lane atanf(value[i]).
NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
    // Reduce to |x| <= 1 using atan's reciprocal identity:
    //   |value| <= 1 -> atan(value)
    //   value > 1    -> pi/2 - atan(1/value)
    //   value < -1   -> -pi/2 - atan(1/value)
    f128 cmp, value_sign;
    {
        f128 one = F128::SetOne();

        // value_sign:
        //   1 if value > 1,
        //   -1 if value < -1
        value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
        cmp = F128::CmpLe(F128::Abs(value), one);
    }
    f128 x = F128::Select(cmp, value, F128::Recp(value));

    // atan(x) = x - 1/3 * x^3 + ... + (-1)^n/(2n+1) * x^(2n+1)
    // = x(1 - xx(1/3 - xx(1/5 - xx(1/7 - xx(1/9 - xx(1/11 - xx(1/13 - xx(1/15 - xx(1/17)...)
    // NOTE:
    // DO NOT USE TAYLOR SERIES
    // minmax approximation(the output of Remez algorithm)
    f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
    f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
    f128 xx = F128::Mult(x, x);
    f128 result;
    result = F128::MultSub<3>(coeff1, xx, F128::SetValue<2>(coeff1, each_select32), each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));

    result = F128::Mult(result, x);
    result = F128::MultSub(xx, result, x);

    // Undo the reciprocal reduction: (+-pi/2) - atan(1/value) for |value| > 1.
    f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
    f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
    result = F128::Select(cmp, result, result_another);
    return result;
}
2477 
// Per-lane atan2f(y[i], x[i]) with the usual special-case table below.
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
    // y / x        -> value
    // 0 / -        -> pi | sign(y)
    // 0 / +        -> 0 | sign(y)
    // y / Inf      -> 0 | sign(y)
    // y / -Inf     -> pi | sign(y)
    // y!=0 / 0     -> pi/2 | sign(y)
    // y!=0 / -     -> atan(y/x) + pi | sign(y)
    // Inf / x      -> pi/2 | sign(y)
    // +-Inf / Inf  -> pi/4 | sign(y)
    // +-Inf / -Inf -> 3pi/4 | sign(y)
    // otherwise    -> atan(y/x)

    // sx = sign(x), sy = sign(y)
    // infx = isinf(x), infy = isinf(y)
    // zerox = iszero(x), zeroy = iszero(y)
    // posx = x > 0
    const f128 signmask = F128::SetSignMask();
    // const f128 sx = F128::And(x, signmask);
    const f128 sy = F128::And(y, signmask);
    const f128 infx = F128::IsInfinite(x);
    const f128 infy = F128::IsInfinite(y);
    const f128 zerox = F128::CmpEqZero(x);
    const f128 zeroy = F128::CmpEqZero(y);
    const f128 posx = F128::CmpGtZero(x);

    // Resolve every special case into v; lanes needing the generic
    // atan(y/x) path come out as the all-ones mask (via OrNot below):
    // v =
    //   infy ?
    //     infx ?
    //       posx ? (pi/4 | sy) : (3pi/4 | sy)
    //     : (pi/2 | sy)
    //   : zeroy ?
    //     posx ? (0 | sy) : (pi | sy)
    //   : zerox ? (pi/2 | sy) : TrueMask;
    const f128 cval = F128::LoadA16(atan2_cvalue_);
    const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
    const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
    const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
    const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));

    f128 v = F128::Select(
        infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
        F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));

// mask = EqInt(v, full);  -- identify the lanes left as all-ones above
// result = Atan(y/x) + (posx ? (0 | sy) : (pi | sy))
// return mask ? result : v;
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 mask;
    mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
    mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
    mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
    mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(CAFE)
    // select makes 0xFFFFFFFFUL -> 0xFF7FFFFFUL
    f128 mask;
    mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
    // Integer compare against all-ones finds the pass-through lanes.
    f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v),
                                                 I128::SetValue(-1, each_int8)));
#endif
    f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
    return F128::Select(mask, result, v);
}
2545 
2546 NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
2547  // asin(x) = atan2 (x, sqrt ((1.0 + x) * (1.0 - x)))
2548  f128 one = F128::SetOne();
2549  f128 tmp = F128::MultSub(value, value, one);
2550  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2551  return F128::ArcTan2(value, argx);
2552 }
2553 
2554 NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
2555  // acos(x) = atan2 (sqrt ((1.0 + x) * (1.0 - x)), x)
2556  f128 one = F128::SetOne();
2557  f128 tmp = F128::MultSub(value, value, one);
2558  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2559  return F128::ArcTan2(argx, value);
2560 }
2561 
2562 // see _mm_movemask_ps of SSE
// Packs the four lane masks into bits 0-3 of an int; see _mm_movemask_ps of SSE.
// NOTE(review): the SSE/CAFE paths test only the sign bit, while the no-SIMD
// path requires a full 0xFFFFFFFF mask — consistent for mask inputs, which is
// presumably the only intended use.
NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT {  // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
    uint8_t ret = 0;
    ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
    ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
    ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
    ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
    // powers = { 1, 2, 4, 8 }: AND with the lane masks, then sum across lanes.
    uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
    uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
    uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
    uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
# ifdef __aarch64__
    return vaddvq_u32(a);
# else
    // Horizontal add via narrow + pairwise adds; the sum (<= 15) fits a byte.
    uint16x4_t tmp = vmovn_u32(a);
    tmp = vpadd_u16(tmp, tmp);
    tmp = vpadd_u16(tmp, tmp);
    return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
# endif
#elif defined(CAFE)
    // Shift each lane's sign bit into its result-bit position.
    int tmp = (value.vec.u[0] >> 31);
    tmp |= (value.vec.u[1] >> 30) & 2;
    tmp |= (value.vec.u[2] >> 29) & 4;
    tmp |= (value.vec.u[3] >> 28) & 8;
    return tmp;
#endif
}
2594 
// true if value[i] == 0 for all i
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
  // PTEST: ZF is set (testz returns nonzero) when casted & casted == 0.
  i128 casted = F128::CastToI128(value);
  return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  // Each zero lane compares to 0xFFFFFFFF (-1 as s32); the sum is -4
  // exactly when all four lanes are zero.
  uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
  // OR the two halves together; the combined 64 bits are 0 iff every
  // lane was 0.
  int32x4_t casted = vreinterpretq_s32_f32(value);
  int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
  return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
# endif
#elif defined(CAFE)
  // CAFE masks carry their truth in the sign bit only.
  uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
  return (tmp & 0x80000000U) == 0;
#endif
}
2616 
// true if value[i] == 0xFFFFFFFF for all i
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT { // NOLINT
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
  value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
  // PTEST carry flag: CF is set (testc returns nonzero) when casted
  // covers every bit of the all-ones vector, i.e. casted is all-ones.
  i128 casted = F128::CastToI128(value);
  return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
# ifdef __aarch64__
  // NOT each lane first; an all-ones lane becomes zero and compares to
  // -1 (as s32), so the sum is -4 exactly when all lanes were all-ones.
  uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
  return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
# else
  // AND the two halves; the combined 64 bits are all-ones iff every
  // lane was all-ones.
  int32x4_t casted = vreinterpretq_s32_f32(value);
  int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
  return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
# endif
#elif defined(CAFE)
  // CAFE masks carry their truth in the sign bit only.
  uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
  return (tmp & 0x80000000U) != 0;
#endif
}
2639 
template <size_t N>
// r = value[N]
// Extracts lane N (compile-time constant, 0-3) of value as a float.
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.v[N];
#elif defined(NLIB_SSE41)
  // _MM_EXTRACT_FLOAT writes the extracted lane into dest.
  float dest;
  _MM_EXTRACT_FLOAT(dest, value, N);
  return dest;
#elif defined(NLIB_NEON)
  return vgetq_lane_f32(value, N);
#elif defined(CAFE)
  // Lane N lives in paired-single N/2, element N%2.
  return value.vec.ps[N / 2][N % 2];
#endif
}
2656 
2657 template <size_t N>
2658 // r = *reinterpret_cast<uint32_t*>(&value[N])
2659 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT { // NOLINT
2660  NLIB_STATIC_ASSERT(N < 4);
2661 #ifdef NLIB_F128_SIMD_NOUSE
2662  return value.vec.u[N];
2663 #elif defined(NLIB_SSE41)
2664  return _mm_extract_ps(value, N);
2665 #elif defined(NLIB_NEON)
2666  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2667  return vgetq_lane_u32(tmp, N);
2668 #elif defined(CAFE)
2669  return value.vec.u[N];
2670 #endif
2671 }
2672 
// r = value[idx]
// Runtime-indexed variant of GetFloatFromLane<N>; idx must be in [0, 3].
NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT { // NOLINT
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  return value.vec.v[idx];
#elif defined(NLIB_SSE41)
  // _MM_EXTRACT_FLOAT needs an immediate lane, so dispatch on idx.
  float dest;
  switch (idx) {
  case 0:
    _MM_EXTRACT_FLOAT(dest, value, 0);
    break;
  case 1:
    _MM_EXTRACT_FLOAT(dest, value, 1);
    break;
  case 2:
    _MM_EXTRACT_FLOAT(dest, value, 2);
    break;
  case 3:
    _MM_EXTRACT_FLOAT(dest, value, 3);
    break;
  default:
    NLIB_ASSUME(0);  // idx > 3 violates the contract; marked unreachable
    break;
  }
  return dest;
#elif defined(NLIB_NEON)
  // vgetq_lane_f32 also requires an immediate lane number.
  switch (idx) {
  case 0:
    return vgetq_lane_f32(value, 0);
  case 1:
    return vgetq_lane_f32(value, 1);
  case 2:
    return vgetq_lane_f32(value, 2);
  case 3:
    return vgetq_lane_f32(value, 3);
  default:
    NLIB_ASSUME(0);  // idx > 3 violates the contract; marked unreachable
    break;
  }
#endif
}
2713 
// r = *reinterpret_cast<uint32_t*>(&value[idx])
// Runtime-indexed variant of GetUint32FromLane<N>; idx must be in [0, 3].
NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  return value.vec.u[idx];
#elif defined(NLIB_SSE41)
  // _mm_extract_ps needs an immediate lane, so dispatch on idx.
  switch (idx) {
  case 0:
    return static_cast<uint32_t>(_mm_extract_ps(value, 0));
  case 1:
    return static_cast<uint32_t>(_mm_extract_ps(value, 1));
  case 2:
    return static_cast<uint32_t>(_mm_extract_ps(value, 2));
  case 3:
    return static_cast<uint32_t>(_mm_extract_ps(value, 3));
  default:
    NLIB_ASSUME(0);  // idx > 3 violates the contract; marked unreachable
    break;
  }
#elif defined(NLIB_NEON)
  // vgetq_lane_u32 also requires an immediate lane number.
  uint32x4_t tmp = vreinterpretq_u32_f32(value);
  switch (idx) {
  case 0:
    return vgetq_lane_u32(tmp, 0);
  case 1:
    return vgetq_lane_u32(tmp, 1);
  case 2:
    return vgetq_lane_u32(tmp, 2);
  case 3:
    return vgetq_lane_u32(tmp, 3);
  default:
    NLIB_ASSUME(0);  // idx > 3 violates the contract; marked unreachable
    break;
  }
#endif
}
2749 
template <size_t N>
// r = value, r[N] = v
// Returns a copy of value with lane N (compile-time constant, 0-3) replaced by v.
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT { // NOLINT
  NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = v;
  return ret;
#elif defined(NLIB_SSE41)
  // insertps: copy element 0 of tmp into element N of value.
  f128 tmp = _mm_set_ss(v);
  return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
  // For a compile-time-known v, a splat + permute lets the compiler fold
  // the operation; otherwise fall back to a plain lane insert.
  return __builtin_constant_p(v) ?
  F128::Permute<N == 0 ? 4 : 0,
  N == 1 ? 5 : 1,
  N == 2 ? 6 : 2,
  N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
  vsetq_lane_f32(v, value, N);
#elif defined(CAFE)
  // Lane N lives in paired-single N/2, element N%2.
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = v;
  return ret;
#endif
}
2774 
// r = value, r[i] = v
// Runtime-indexed variant of SetFloatToLane<N>; i must be in [0, 3].
NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[i] = v;
  return ret;
#elif defined(NLIB_SSE41)
  // insertps needs an immediate control byte, so dispatch on i.
  f128 tmp = _mm_set_ss(v);
  switch (i) {
  case 0:
    return _mm_insert_ps(value, tmp, 0x00);
  case 1:
    return _mm_insert_ps(value, tmp, 0x10);
  case 2:
    return _mm_insert_ps(value, tmp, 0x20);
  case 3:
    return _mm_insert_ps(value, tmp, 0x30);
  default:
    NLIB_ASSUME(0);  // i > 3 violates the contract; marked unreachable
    break;
  }
#elif defined(NLIB_NEON)
  // Reuse the compile-time-lane implementation for each possible index.
  switch (i) {
  case 0:
    return F128::SetFloatToLane<0>(value, v);
  case 1:
    return F128::SetFloatToLane<1>(value, v);
  case 2:
    return F128::SetFloatToLane<2>(value, v);
  case 3:
    return F128::SetFloatToLane<3>(value, v);
  default:
    NLIB_ASSUME(0);  // i > 3 violates the contract; marked unreachable
    break;
  }
#elif defined(CAFE)
  // Lane i lives in paired-single i/2, element i%2; the default case
  // handles i == 3 (no bounds check is performed).
  f128 ret = value;
  switch (i) {
  case 0:
    ret.vec.ps[0][0] = v;
    break;
  case 1:
    ret.vec.ps[0][1] = v;
    break;
  case 2:
    ret.vec.ps[1][0] = v;
    break;
  default:
    ret.vec.ps[1][1] = v;
    break;
  }
  return ret;
#endif
}
2829 
2830 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Merges two 64-bit float pairs: the result holds lane (IsHighA ? 1 : 0)
// of a followed by lane (IsHighB ? 1 : 0) of b. Building block for the
// NEON Swizzle/Permute helpers below.
template <bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

// { a[0], b[0] }
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(a, b);
#else
  return vtrn_f32(a, b).val[0];
#endif
};

// { a[1], b[0] } — reverse a first so transpose picks its high lane.
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(vrev64_f32(a), b);
#else
  return vtrn_f32(vrev64_f32(a), b).val[0];
#endif
};

// { a[0], b[1] } — reverse b first so transpose picks its high lane.
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(a, vrev64_f32(b));
#else
  return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif
};

// { a[1], b[1] }
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn2_f32(a, b);
#else
  return vtrn_f32(a, b).val[1];
#endif
};

// Returns half Z of a 128-bit vector: 0 -> low pair, 1 -> high pair.
template <int Z>
float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
  return vget_low_f32(value);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
  return vget_high_f32(value);
}

// Generic two-lane swizzle: { value[X0], value[X1] } built from the
// containing 64-bit halves via F64Merge. The specializations below
// replace it with cheaper single-instruction forms where possible.
template <int X0, int X1>
struct F128SwizzleHelper2 {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
    float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

// { value[X], value[X] } — a single lane duplicated.
template <int X>
struct F128SwizzleHelper2<X, X> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t x = F128SwizzleGet64<X / 2>(value);
    return vdup_lane_f32(x, (X & 1));
  }
};

// { value[0], value[1] } — just the low half.
template <>
struct F128SwizzleHelper2<0, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vget_low_f32(value);
  }
};

// { value[0], value[2] }
template <>
struct F128SwizzleHelper2<0, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp1q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

// { value[0], value[3] }
template <>
struct F128SwizzleHelper2<0, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

// { value[1], value[0] } — low half reversed.
template <>
struct F128SwizzleHelper2<1, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_low_f32(value));
  }
};

// { value[1], value[2] }
template <>
struct F128SwizzleHelper2<1, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip2_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

// { value[1], value[3] }
template <>
struct F128SwizzleHelper2<1, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp2q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

// { value[2], value[0] }
template <>
struct F128SwizzleHelper2<2, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

// { value[2], value[1] }
template <>
struct F128SwizzleHelper2<2, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

// { value[2], value[3] } — just the high half.
template <>
struct F128SwizzleHelper2<2, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vget_high_f32(value);
  }
};

// { value[3], value[0] }
template <>
struct F128SwizzleHelper2<3, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

// { value[3], value[1] }
template <>
struct F128SwizzleHelper2<3, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
    return vzip2_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

// { value[3], value[2] } — high half reversed.
template <>
struct F128SwizzleHelper2<3, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_high_f32(value));
  }
};

// Full four-lane swizzle: build each 64-bit half independently and
// recombine. Specializations below catch cheaper repeated-lane patterns.
template <int V0, int V1, int V2, int V3>
struct F128SwizzleHelper {
  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
    detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
  }
};

// Both halves identical: compute the pair once and duplicate it.
template <int Vx, int Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
    return vcombine_f32(tmp, tmp);
  }
};

// All four lanes identical: a single-lane broadcast.
template <int V>
struct F128SwizzleHelper<V, V, V, V> {
  static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return F128::SetValue<V>(value, each_select32);
  }
};

} // namespace detail
3058 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Two-lane swizzle over a pair of paired-singles: selects lanes X0 and X1
// out of the four lanes { v0[0], v0[1], v1[0], v1[1] }. Each specialization
// maps to a single __PS_MERGE instruction (MERGEab = { first[a], second[b] })
// or a plain copy.
template <int X0, int X1>
struct F128SwizzleHelper {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT;
};

// { v0[0], v0[0] }
template<>
struct F128SwizzleHelper<0, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE00(v0, v0);
  }
};

// { v0[0], v0[1] } — identity on the first pair.
template<>
struct F128SwizzleHelper<0, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return v0;
  }
};

// { v0[0], v1[0] }
template<>
struct F128SwizzleHelper<0, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE00(v0, v1);
  }
};

// { v0[0], v1[1] }
template<>
struct F128SwizzleHelper<0, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE01(v0, v1);
  }
};

// { v0[1], v0[0] }
template<>
struct F128SwizzleHelper<1, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE10(v0, v0);
  }
};

// { v0[1], v0[1] }
template<>
struct F128SwizzleHelper<1, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v1;
    return __PS_MERGE11(v0, v0);
  }
};

// { v0[1], v1[0] }
template<>
struct F128SwizzleHelper<1, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE10(v0, v1);
  }
};

// { v0[1], v1[1] }
template<>
struct F128SwizzleHelper<1, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE11(v0, v1);
  }
};

// { v1[0], v0[0] }
template<>
struct F128SwizzleHelper<2, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE00(v1, v0);
  }
};

// { v1[0], v0[1] }
template<>
struct F128SwizzleHelper<2, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE01(v1, v0);
  }
};

// { v1[0], v1[0] }
template<>
struct F128SwizzleHelper<2, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE00(v1, v1);
  }
};

// { v1[0], v1[1] } — identity on the second pair.
template<>
struct F128SwizzleHelper<2, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return v1;
  }
};

// { v1[1], v0[0] }
template<>
struct F128SwizzleHelper<3, 0> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE10(v1, v0);
  }
};

// { v1[1], v0[1] }
template<>
struct F128SwizzleHelper<3, 1> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    return __PS_MERGE11(v1, v0);
  }
};

// { v1[1], v1[0] }
template<>
struct F128SwizzleHelper<3, 2> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE10(v1, v1);
  }
};

// { v1[1], v1[1] }
template<>
struct F128SwizzleHelper<3, 3> {
  static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
    (void)v0;
    return __PS_MERGE11(v1, v1);
  }
};

} // namespace detail
3187 #endif
3188 
template <int V0, int V1, int V2, int V3>
// r[0] = value[V0], r[1] = value[V1], r[2] = value[V2], r[3] = value[V3]
// Compile-time lane shuffle of a single vector. -1 for a lane means
// "keep that lane's original value" (index defaults to the lane number).
NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
  NLIB_STATIC_ASSERT(V0 < 4);
  NLIB_STATIC_ASSERT(V1 < 4);
  NLIB_STATIC_ASSERT(V2 < 4);
  NLIB_STATIC_ASSERT(V3 < 4);
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
  ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
  ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
  ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
  return ret;
#elif __has_builtin(__builtin_shufflevector)
  // clang can emit the optimal shuffle directly.
  return __builtin_shufflevector(value, value, V0, V1, V2, V3);
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value,
  _MM_SHUFFLE(V3 != -1 ? V3 : 3,
  V2 != -1 ? V2 : 2,
  V1 != -1 ? V1 : 1,
  V0 != -1 ? V0 : 0));
#elif defined(NLIB_NEON)
  // Dispatch to the specialized helper table in namespace detail.
  return detail::F128SwizzleHelper<
  V0 != -1 ? V0 : 0,
  V1 != -1 ? V1 : 1,
  V2 != -1 ? V2 : 2,
  V3 != -1 ? V3 : 3>::Swizzle(value);
#elif defined(CAFE)
  // Build each paired-single half from the two source halves.
  f128 ret;
  ret.vec.ps[0] = detail::F128SwizzleHelper<
  (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
  ret.vec.ps[1] = detail::F128SwizzleHelper<
  (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
  return ret;
#endif
}
3226 
3227 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Swizzle specialization for NEON
// Each full specialization below replaces the generic half-by-half build
// with a single NEON shuffle instruction for that lane pattern.

// {0,0,1,1}: interleave value with itself (low zip).
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip1q_f32(value, value);
#else
  return vzipq_f32(value, value).val[0];
#endif
}
// {0,0,2,2}: even lanes duplicated (transpose, first row).
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[0];
#endif
}
// {0,1,2,3}: identity — no instruction needed.
template <>
NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
  return value;
}
// {0,2,0,2}: even lanes repeated (unzip, first half).
template <>
NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp1q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[0];
#endif
}
// {1,0,3,2}: swap lanes within each 64-bit half.
template <>
NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
  return vrev64q_f32(value);
}
// {1,1,3,3}: odd lanes duplicated (transpose, second row).
template <>
NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn2q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[1];
#endif
}
// {1,2,3,0}: rotate left by one lane via a 32-bit extract.
template <>
NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 1);
  return vreinterpretq_f32_u32(rotated);
}
// {1,3,1,3}: odd lanes repeated (unzip, second half).
template <>
NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp2q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[1];
#endif
}
// {2,2,3,3}: interleave value with itself (high zip).
template <>
NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip2q_f32(value, value);
#else
  return vzipq_f32(value, value).val[1];
#endif
}
// {2,3,0,1}: swap the 64-bit halves (rotate by two lanes).
template <>
NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 2);
  return vreinterpretq_f32_u32(rotated);
}
// {3,0,1,2}: rotate right by one lane (extract by three).
template <>
NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 3);
  return vreinterpretq_f32_u32(rotated);
}
3303 #endif
3304 
3305 namespace detail {
3306 
3307 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Two-vector permute helper for SSE4.1. Lane indices 0-3 select from a,
// 4-7 select from b. The generic case swizzles both sources into position
// and blends; the specializations pick cheaper single-instruction forms.
template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2 {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
    f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
    return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
    ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

// UseBlend: every lane keeps its own position, so a single blend suffices.
template <bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
    ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

// UseShuffle: low half comes from one source, high half from the other,
// so a single shufps covers it.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
    _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
  }
};

// {a1,a2,a3,b0}: byte-wise rotate of the a:b concatenation.
template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
    return _mm_castsi128_ps(tmp);
  }
};

// {a3,b0,b1,b2}: byte-wise rotate of the a:b concatenation.
template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
    return _mm_castsi128_ps(tmp);
  }
};
3350 template <>
3351 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3352  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3353  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
3354  return _mm_castsi128_ps(tmp);
3355  }
3356 };
3357 
// One-lane-from-b patterns: a single insertps copies lane V-4 of b into
// the differing lane of a (control byte = src_lane << 6 | dst_lane << 4).
template <int V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V > 3);
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
  }
};

// Mirror patterns: one lane from a inserted into b.
template <int V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
  }
};

template <int V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 4);
    return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
  }
};

// Entry point: classifies the index pattern at compile time into
// blend-only (every lane in place) / single-shuffle (halves split by
// source) / general, then forwards to F128PermuteHelper2.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128PermuteHelper2<
    ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
    ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
    V0, V1, V2, V3>::Permute(a, b);
  }
};
3431 
3432 #elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3433 
// Returns 64-bit half Z of the (a, b) pair: 0/1 -> halves of a,
// 2/3 -> halves of b.
template <int Z>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(b);
  return vget_low_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(b);
  return vget_high_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(a);
  return vget_low_f32(b);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  NLIB_UNUSED(a);
  return vget_high_f32(b);
}

// Two-lane permute: fetch the halves holding lanes X0 and X1 of the
// 8-lane (a, b) space, then merge the wanted lane of each.
template <int X0, int X1>
struct F128PermuteHelper2 {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
    float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

// Same lane twice: a single-lane duplicate.
template <int X>
struct F128PermuteHelper2<X, X> {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x = F128PermuteGet64<X / 2>(a, b);
    return vdup_lane_f32(x, (X & 1));
  }
};

// Full permute: build each result half independently and recombine.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
    F128PermuteHelper2<V2, V3>::Permute(a, b));
  }
};

// {a1,a2,a3,b0}: single vext over the a:b concatenation.
template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};

// {a3,b0,b1,b2}: single vext over the a:b concatenation.
template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
    return vreinterpretq_f32_s32(tmp);
  }
};

// {b1,b2,b3,a0}: single vext over the b:a concatenation.
template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};
3506 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3507 template<int R0, int R1, int VAR0, int VAR1>
3508 struct F128PermuteHelper2 {
3509  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
3510 };
3511 
3512 template<int R0, int R1>
3513 struct F128PermuteHelper2<R0, R1, 0, 0> {
3514  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3515  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
3516  }
3517 };
3518 
3519 template<int R0, int R1>
3520 struct F128PermuteHelper2<R0, R1, 0, 1> {
3521  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3522  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
3523  }
3524 };
3525 
3526 template<int R0, int R1>
3527 struct F128PermuteHelper2<R0, R1, 0, 2> {
3528  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3529  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
3530  }
3531 };
3532 
3533 template<int R0, int R1>
3534 struct F128PermuteHelper2<R0, R1, 0, 3> {
3535  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3536  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
3537  }
3538 };
3539 
// F128PermuteHelper2<R0, R1, PS0, PS1> builds one paired-single (f32x2) half
// of a permuted result on the CAFE path. PS0/PS1 select which of the four
// input pairs feeds each output slot (v0 = a.lo, v1 = a.hi, v2 = b.lo,
// v3 = b.hi) and R0/R1 select the element (0 or 1) inside the chosen pair.
// The delegation target F128SwizzleHelper<R0, (2 + R1)> presumably takes
// element R0 from its first argument and element R1 from its second (the
// +2 offset addressing the second pair) -- confirm against SwizzleHelper.
// The specializations below enumerate the remaining (PS0, PS1) combinations.
template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
  }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
  }
};
3623 
// Generic permute for the CAFE paired-single path: splits a and b into their
// two f32x2 halves and builds each half of the result independently via
// F128PermuteHelper2. For a lane selector V in 0..7: (V & 1) is the element
// inside a pair, (V / 2) is which of the four input pairs to read.
// IsAllA/IsAllB are false here; the all-from-one-operand cases are handled
// by the partial specializations further below.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret;
    // x0/x1 = low/high pair of a, x2/x3 = low/high pair of b.
    f32x2 x0 = a.vec.ps[0];
    f32x2 x1 = a.vec.ps[1];
    f32x2 x2 = b.vec.ps[0];
    f32x2 x3 = b.vec.ps[1];
    ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
        ::Permute(x0, x1, x2, x3);
    ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
        ::Permute(x0, x1, x2, x3);
    return ret;
  }
};
3639 #else
// Generic permute fallback (SSE/NEON without a cheaper pattern): extracts
// each requested scalar -- lane (Vi & 3) of a when Vi < 4, of b otherwise --
// and rebuilds the vector with SetValue. Correct for any selector pattern,
// but slower than the pattern-specific specializations below.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                              F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                              F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                              F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
    return ret;
  }
};
3650 #endif
3651 
// All four selectors reference operand a: the permute degenerates into a
// single-operand swizzle of a; b is ignored.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return F128::Swizzle<V0, V1, V2, V3>(a);
  }
};
3659 
// All four selectors reference operand b (indices 4..7): rebase them to
// 0..3 and swizzle b alone; a is ignored.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(a);
    return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
  }
};
3667 
3668 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Permute specializations for SSE4.1: the four interleave patterns map
// directly onto a single unpack instruction instead of a generic shuffle.
template <>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(a, b);  // {a0, b0, a1, b1}
  }
};
template <>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(b, a);  // {b0, a0, b1, a1}
  }
};
template <>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(a, b);  // {a2, b2, a3, b3}
  }
};
template <>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(b, a);  // {b2, a2, b3, a3}
  }
};
3694 #endif
3695 
// Entry point of the "don't care" resolution chain. Selectors are 0..7
// (8, substituted by F128::Permute for -1, means the caller does not care
// what lands in that lane). This primary template handles the case where
// every selector is concrete: it classifies the pattern (all-from-a /
// all-from-b / mixed) and dispatches to the matching F128PermuteHelper.
template<int V0, int V1, int V2, int V3>
struct F128PermuteDontCareHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);  // only a used
    static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);  // only b used
    return detail::F128PermuteHelper< arg1, arg2,
                                      V0, V1, V2, V3 >::Permute(a, b);
  }
};
3709 
// ---- one "don't care" lane (selector == 8) -------------------------------
// Each specialization replaces the wildcard with a concrete index chosen so
// that the resolved pattern stays cheap -- the wildcard lane is filled with
// its neighbor's partner inside the same even/odd lane pair (keeping both
// halves of a 64-bit pair sourced together) -- then recurses with a fully
// specified pattern.
template<int V1, int V2, int V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    // V0 becomes the even partner of V1 (V1 itself if already even).
    static const int V0 = (V1 & 1) ? V1 - 1 : V1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V0, int V2, int V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    // V1 becomes the odd partner of V0 (V0 itself if already odd).
    static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V0, int V1, int V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    // V2 becomes the even partner of V3.
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V0, int V1, int V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    // V3 becomes the odd partner of V2.
    static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
3753 
// ---- two "don't care" lanes ----------------------------------------------
// Same idea with two wildcards: fill each one with a cheap concrete index
// (a pair partner of a concrete neighbor, or a whole adjacent pair drawn
// from the same operand as the concrete lanes), then recurse.
template<int V2, int V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V2 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    // Fill the low half with pair {0,1} of whichever operand V2 uses.
    static const int V0 = (V2 < 4) ? 0 : 4;
    return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
  }
};

template<int V1, int V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    static const int V0 = (V1 & 1) ? V1 - 1: V1;  // even partner of V1
    static const int V3 = (V2 & 1) ? V2 : V2 + 1;  // odd partner of V2
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V0, int V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V1 < 8);
    // Fill the high half with pair {2,3} of whichever operand V1 uses.
    static const int V2 = (V1 < 4) ? 2 : 6;
    return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
  }
};

template<int V0, int V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const int V1 = (V0 & 1) ? V0 : V0 + 1;  // odd partner of V0
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;  // even partner of V3
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V0, int V2>
struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 8);
    NLIB_STATIC_ASSERT(V2 < 8);
    static const int V1 = (V0 & 1) ? V0 : V0 + 1;  // odd partner of V0
    static const int V3 = (V2 & 1) ? V2 : V2 + 1;  // odd partner of V2
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template<int V1, int V3>
struct F128PermuteDontCareHelper<8, V1, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V1 < 8);
    NLIB_STATIC_ASSERT(V3 < 8);
    static const int V0 = (V1 & 1) ? V1 - 1 : V1;  // even partner of V1
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;  // even partner of V3
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
3817 
// ---- three "don't care" lanes --------------------------------------------
// One concrete selector V remains. When V sits in its "natural" position
// (lane i holding (V & 3) == i) the wildcards are filled with V's three
// consecutive neighbors, producing an identity-style pattern on one
// operand; otherwise every lane is simply filled with V (a splat).
template<int V>
struct F128PermuteDontCareHelper<V, 8, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 8);
    static const int V1 = ((V & 3) == 0) ? V + 1 : V;
    static const int V2 = ((V & 3) == 0) ? V + 2 : V;
    static const int V3 = ((V & 3) == 0) ? V + 3 : V;
    return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
  }
};

template<int V>
struct F128PermuteDontCareHelper<8, V, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 8);
    static const int V0 = ((V & 3) == 1) ? V - 1 : V;
    static const int V2 = ((V & 3) == 1) ? V + 1 : V;
    static const int V3 = ((V & 3) == 1) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
  }
};

template<int V>
struct F128PermuteDontCareHelper<8, 8, V, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 8);
    static const int V0 = ((V & 3) == 2) ? V - 2 : V;
    static const int V1 = ((V & 3) == 2) ? V - 1 : V;
    static const int V3 = ((V & 3) == 2) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
  }
};

template<int V>
struct F128PermuteDontCareHelper<8, 8, 8, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V < 8);
    static const int V0 = ((V & 3) == 3) ? V - 3 : V;
    static const int V1 = ((V & 3) == 3) ? V - 2 : V;
    static const int V2 = ((V & 3) == 3) ? V - 1 : V;
    return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
  }
};
3861 
// Every lane is "don't care": any result is acceptable, so return a
// unchanged -- zero instructions.
template <>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return a;
  }
};
3869 
3870 } // namespace detail
3871 
template <int V0, int V1, int V2, int V3>
// r[0] = V0 < 4 ? a[V0] : b[V0 - 4], .....
// Lane selectors: 0-3 read from a, 4-7 read from b. A selector of -1 (or,
// equivalently, 8 -- both branches below normalize between the two) marks
// a "don't care" lane whose content is unspecified, letting the
// implementation pick the cheapest instruction sequence.
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
  // Clang: __builtin_shufflevector expresses don't-care lanes as -1.
  return __builtin_shufflevector(a, b,
                                 (V0 != 8 ? V0 : -1),
                                 (V1 != 8 ? V1 : -1),
                                 (V2 != 8 ? V2 : -1),
                                 (V3 != 8 ? V3 : -1));
#else
  // Template dispatch path: don't-care lanes are encoded as 8.
  return detail::F128PermuteDontCareHelper <
      V0 != -1 ? V0 : 8,
      V1 != -1 ? V1 : 8,
      V2 != -1 ? V2 : 8,
      V3 != -1 ? V3 : 8>::Permute(a, b);
#endif
}
3889 
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
// r[i] = SplatLane_i ? splat[i] : value[i]
// Precondition: splat[0] == splat[1] == splat[2] == splat[3], so any lane
// of splat may stand in for any other.
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  // Exploits the all-lanes-equal precondition: within each lane pair the
  // splat index is chosen cross-wise (4<->5, 6<->7) when only one lane of
  // the pair is replaced -- presumably so the resulting Permute pattern
  // maps onto cheaper NEON pairwise primitives; confirm against the NEON
  // Permute specializations.
  const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
  const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
  const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
  const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
  // SSE4.1 has _mm_blend_ps(): a straight per-lane select is cheapest.
  const int v0 = SplatLane0 ? 4 : 0;
  const int v1 = SplatLane1 ? 5 : 1;
  const int v2 = SplatLane2 ? 6 : 2;
  const int v3 = SplatLane3 ? 7 : 3;
#endif
  return F128::Permute<v0, v1, v2, v3>(value, splat);
}
3908 
// Computes 2^value per lane.
// SIMD path: Cephes-style rational approximation. value is split into a
// rounded integer part iround and a fractional remainder x in [-0.5, 0.5];
// 2^x is approximated by the rational form Q[3] * px/(qx - px) + Q[2]
// (with exp2_P_/exp2_Q_ holding the polynomial coefficients -- their
// values are defined elsewhere; presumably the classic Cephes exp2
// constants), and the result is scaled by 2^iround built directly in the
// float exponent field. Overflow/underflow of the exponent is NOT handled
// (see note at the end).
NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  // Scalar fallback: one powf per lane.
  f128 ret;
  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
  return ret;
#else
  // value = iround + x, |x| <= 0.5
  i128 iround = F128::ConvertToI128Round(value);
  f128 fround = F128::ConvertFromI128(iround);
  f128 x = F128::Sub(value, fround);
  f128 xx = F128::Mult(x, x);

  f128 P = F128::LoadA16(F128::exp2_P_);
  f128 Q = F128::LoadA16(F128::exp2_Q_);

  f128 px;
  // px = P[0]
  // px = px * xx + P[1]
  // px = px * xx + P[2]
  // px = x * px;
  px = F128::MultAdd<0>(P, xx, F128::SetValue<1>(P, each_select32), each_select32);
  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
  px = F128::Mult(x, px);

  f128 qx;
  // qx = xx + Q[0]
  // qx = qx * xx + Q[1]
  qx = F128::Add(xx, F128::SetValue<0>(Q, each_select32));
  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));

  // 2^x ~= Q[3] * (px / (qx - px)) + Q[2]
  x = F128::Div(px, F128::Sub(qx, px));
  x = F128::MultAdd<3>(Q, x, F128::SetValue<2>(Q, each_select32), each_select32);

  // x = x * 2^iround
  // 2^iround is synthesized by writing (iround + 127) into the biased
  // exponent field of an IEEE-754 single.
  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
  iround = I128::ShiftLeftLogical32(iround, 23);
  x = F128::Mult(x, F128::CastFromI128(iround));

  // NOTE:
  // overflow not checked
  return x;
#endif
}
3954 
3955 NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
3956  static const float log2e = 1.44269504088896340736f;
3957  return Exp2(F128::Mult(log2e, value));
3958 }
3959 
3960 NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
3961  static const float log2e = 1.44269504088896340736f;
3962  f128 negOne = F128::SetValue(-1.f, each_float);
3963  f128 v0 = F128::MultAdd(log2e, value, negOne);
3964  f128 v1 = F128::MultSub(log2e, value, negOne);
3965  f128 e0 = Exp2(v0);
3966  f128 e1 = Exp2(v1);
3967  return F128::Sub(e0, e1);
3968 }
3969 
3970 NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
3971  static const float log2e = 1.44269504088896340736f;
3972  f128 negOne = F128::SetValue(-1.f, each_float);
3973  f128 v0 = F128::MultAdd(log2e, value, negOne);
3974  f128 v1 = F128::MultSub(log2e, value, negOne);
3975  f128 e0 = Exp2(v0);
3976  f128 e1 = Exp2(v1);
3977  return F128::Add(e0, e1);
3978 }
3979 
// Computes tanh(value) per lane using
//   tanh(x) = 1 - 2 / (1 + expE(2 * x))
// (the original comment read "1 - 2 * (1 + expE(2x))", which is a typo).
// tanh_cvalue_ is defined elsewhere; from the lane usage below its layout
// is presumably { 2*log2(e), 1.0f, 0.5f, ... } -- confirm at its definition.
NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
  f128 cvalue = F128::LoadA16(tanh_cvalue_);
  f128 e = F128::Mult<0>(cvalue, value, each_select32);  // x * cvalue[0]
  e = F128::Exp2(e);                                     // == expE(2x) if cvalue[0] == 2*log2e
  f128 half = F128::SetValue<2>(cvalue, each_select32);
  e = F128::MultAdd(half, e, half);                      // (e + 1) * 0.5
  e = F128::Recp(e);                                     // 2 / (e + 1)
  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);  // 1 - 2/(e+1)
}
3990 
// Computes tan(value) per lane.
// SIMD path: Cody-Waite style argument reduction by multiples of pi/2
// followed by a rational (minimax) approximation; tan_c_/tan_p_/tan_q_ are
// coefficient tables defined elsewhere (from the usage, C[0] is presumably
// 2/pi and C[1]+C[2] a two-constant split of pi/2 -- confirm at their
// definitions). For odd multiples of pi/2 the identity
// tan(x) = -cot(x - pi/2) = -1/tan(x - pi/2) is applied.
NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  // Scalar fallback: one tanf per lane.
  f128 ret;
  ret.vec.v[0] = tanf(value.vec.v[0]);
  ret.vec.v[1] = tanf(value.vec.v[1]);
  ret.vec.v[2] = tanf(value.vec.v[2]);
  ret.vec.v[3] = tanf(value.vec.v[3]);
  return ret;
#else
  // Cody and Waite algorithm
  f128 C = F128::LoadA16(&F128::tan_c_[0]);

  // g = round(value / (pi/2))
  f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
  // nearXAxis: lanes where g is even (reduced angle lies in tan's own
  // period, so no cotangent flip is needed).
  f128 nearXAxis;
  {
    i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
    i128 cmp = I128::CmpEq32(t0, I128::SetZero());
    nearXAxis = F128::CastFromI128(cmp);
  }

  // f = value - (pi/2) * g
  // Two-step subtraction with the split constant (C[1], C[2]) keeps the
  // reduction accurate for large arguments.
  f128 f = F128::MultSub<1>(C, g, value, each_select32);
  f = F128::MultSub<2>(C, g, f, each_select32);

  // Lanes where |f| <= C[3]: the rational form is skipped (p = f, q = 1)
  // to avoid losing precision near the axes.
  f128 nearAxis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));

  f128 P = F128::LoadA16(&F128::tan_p_[0]);
  f128 Q = F128::LoadA16(&F128::tan_q_[0]);

  f128 ff = F128::Mult(f, f);
  f128 one = F128::SetValue<3>(P, each_select32);

  // p = f * (1 + ff*(P[0] + ff*(P[1] + ff*P[2])))   (Horner form)
  f128 p = F128::MultAdd<2>(P, ff, F128::SetValue<1>(P, each_select32), each_select32);
  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
  p = F128::MultAdd(p, ff, one);
  p = F128::Mult(f, p);

  // q = 1 + ff*(Q[0] + ff*(Q[1] + ff*(Q[2] + ff*Q[3])))   (Horner form)
  f128 q = F128::MultAdd<3>(Q, ff, F128::SetValue<2>(Q, each_select32), each_select32);
  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
  q = F128::MultAdd(q, ff, one);

  p = F128::Select(nearAxis, f, p);
  p = q = F128::Select(nearAxis, one, q), p;  // (no-op on p; q = 1 near axes)

  f128 r0 = F128::Div(p, q);                 // tan(f)
  f128 r1 = F128::Negate(F128::Recp(r0));    // -cot(f) = -1/tan(f)

  return F128::Select(nearXAxis, r0, r1);
#endif
}
4043 
// Computes log2(value) per lane.
// SIMD path: decomposes value = m * 2^e with m in [1, 2) via exponent-field
// bit manipulation, evaluates a rational approximation of log(m) around
// m = 1, converts to base 2 and adds e. log2_PQ_ holds the coefficients
// (defined elsewhere; the final lane, used as "log2ea", is presumably
// log2(e) - 1, the classic Cephes LOG2EA constant -- confirm at its
// definition). Special values are patched afterwards:
//   NaN -> NaN, +inf -> +inf, 0 -> -inf, negative -> -NaN.
NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  // Scalar fallback: log2(x) = ln(x) / ln(2).
  static const float scale = 1.4426950408889634f;  // 1 / LogE(2.0)
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return F128::Mult(scale, ret);
#else
  // x = frexp(value, &e)
  // Keep sign+mantissa, force the biased exponent to 127 -> x in [1, 2);
  // e = unbiased exponent of value.
  f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
  e = I128::ShiftRightLogical32(e, 23);
  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));

  x = F128::Sub(x, F128::SetOne());  // x = m - 1, in [0, 1)
  f128 z = F128::Mult(x, x);
  f128 y;

  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);

  // p = polynomial in x (Horner), numerator of the rational approximation.
  f128 p = F128::SetValue<0>(pq0, each_select32);
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));

  // q = monic polynomial in x (Horner), denominator.
  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));

  // y = log(1+x) - x = z*p/q - z/2  (z = x^2)
  y = F128::Mult(z, p);
  y = F128::Div(y, q);
  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));

  f128 result;
  {
    // do not optimize
    // log2(m) = (x + y)*log2ea + (x + y); keeping the terms separate (and
    // in this order) preserves precision, hence the "do not optimize".
    f128 log2ea = F128::SetValue<3>(pq2, each_select32);
    result = F128::Mult(y, log2ea);
    result = F128::MultAdd(log2ea, x, result);
    result = F128::Add(result, y);
    result = F128::Add(result, x);
    result = F128::Add(result, F128::ConvertFromI128(e));
  }

  {
    f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));

    // value is NaN -> NaN
    f128 is_nan = F128::IsNaN(value);
    f128 nan = F128::SetValue<0>(nan_inf, each_select32);
    result = F128::Select(is_nan, nan, result);

    f128 is_inf = F128::IsInfinite(value);
    f128 is_pos = F128::CmpGtZero(value);

    // value == inf -> +inf
    f128 inf = F128::SetValue<1>(nan_inf, each_select32);
    f128 is_pos_inf = F128::And(is_inf, is_pos);
    result = F128::Select(is_pos_inf, inf, result);

    // value == 0 -> -inf
    f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
    f128 is_zero = F128::CmpEqZero(value);
    result = F128::Select(is_zero, neg_inf, result);

    // value < 0 -> -NaN
    f128 neg_nan = F128::SetValue<2>(nan_inf, each_select32);
    f128 is_neg = F128::CmpLtZero(value);
    result = F128::Select(is_neg, neg_nan, result);

    // otherwise -> Log2(value)
  }

  return result;
#endif
}
4129 
// Computes the natural logarithm per lane.
// SIMD path: ln(x) = log2(x) * ln(2); inherits Log2's special-value
// handling (NaN/inf/0/negative).
NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return ret;
#else
  f128 x = F128::Log2(value);
  static const float recp_log2e = 0.6931471805597018f;  // ln(2) = 1/log2(e)
  return F128::Mult(recp_log2e, x);
#endif
}
4144 
4145 #undef NLIB_M
4146 #undef NLIB_M2
4147 #endif // NLIB_DOXYGEN
4148 
4149 typedef f128 SimdVector;
4150 typedef f128arg SimdVectorArg;
4151 typedef f128 SimdQuaternion;
4152 typedef f128arg SimdQuaternionArg;
4153 typedef f128 SimdPlane;
4154 typedef f128arg SimdPlaneArg;
4155 typedef f128 SimdSphere;
4156 typedef f128arg SimdSphereArg;
4157 
4158 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4159 struct NLIB_ALIGNAS(16) SimdMatrix {
4160 #else
4161 struct SimdMatrix {
4162 #endif
4163  public:
4165  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
4166  r[0] = r0;
4167  r[1] = r1;
4168  r[2] = r2;
4169  r[3] = r3;
4170  }
4171  SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4172  float m13, float m20, float m21, float m22, float m23, float m30, float m31,
4173  float m32, float m33) NLIB_NOEXCEPT;
4174  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
4175 
4176  public:
4177  f128 r[4];
4178 };
4179 
// Builds the matrix from 16 scalars: mRC is stored in row R, lane C.
inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
                              float m12, float m13, float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
  r[0] = F128::SetValue(m00, m01, m02, m03);
  r[1] = F128::SetValue(m10, m11, m12, m13);
  r[2] = F128::SetValue(m20, m21, m22, m23);
  r[3] = F128::SetValue(m30, m31, m32, m33);
}
4188 
4189 inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
4190  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4191  NLIB_ASSERT((algn & 3) == 0);
4192  switch (algn >> 2) {
4193  case 0:
4194  r[0] = F128::LoadA16(p);
4195  r[1] = F128::LoadA16(p + 4);
4196  r[2] = F128::LoadA16(p + 8);
4197  r[3] = F128::LoadA16(p + 12);
4198  break;
4199  case 1:
4200  r[0] = F128::LoadA4(p);
4201  r[1] = F128::LoadA4(p + 4);
4202  r[2] = F128::LoadA4(p + 8);
4203  r[3] = F128::LoadA4(p + 12);
4204  break;
4205  case 2:
4206  r[0] = F128::LoadA8(p);
4207  r[1] = F128::LoadA8(p + 4);
4208  r[2] = F128::LoadA8(p + 8);
4209  r[3] = F128::LoadA8(p + 12);
4210  break;
4211  default:
4212  NLIB_ASSUME(0);
4213  break;
4214  }
4215 }
4216 
4217 #if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
4218 typedef const SimdMatrix& SimdMatrixArg;
4219 #else
4220 typedef const SimdMatrix SimdMatrixArg;
4221 #endif
4222 
// NLIB_F128_TRANSPOSE(row0..row3): transposes a 4x4 float matrix held in
// four f128 rows, in place, using the cheapest primitive per platform.
#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
// SSE4.1 / scalar fallback: two rounds of pairwise Permute
// (interleave low/high halves, then gather even/odd lanes).
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
  { \
    f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
    f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
    f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
    f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
    row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
    row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
    row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
    row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
  }
#elif defined(NLIB_NEON)
# ifdef __aarch64__
// AArch64: vtrnq_f32 transposes the 2x2 blocks of 32-bit lanes, then
// vtrn1q/vtrn2q on 64-bit lanes combine the matching block halves.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
  { \
    float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
    float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
    uint64x2_t row0_, row1_, row2_, row3_; \
    row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                       vreinterpretq_u64_f32(trn_f1_.val[0])); \
    row0 = vreinterpretq_f32_u64(row0_); \
    row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                       vreinterpretq_u64_f32(trn_f1_.val[1])); \
    row1 = vreinterpretq_f32_u64(row1_); \
    row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                       vreinterpretq_u64_f32(trn_f1_.val[0])); \
    row2 = vreinterpretq_f32_u64(row2_); \
    row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                       vreinterpretq_u64_f32(trn_f1_.val[1])); \
    row3 = vreinterpretq_f32_u64(row3_); \
  }
# else
// ARMv7 NEON: vtrnq_f32 plus vcombine of the low/high d-registers.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
  { \
    float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
    float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
    row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
    row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
    row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
    row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
  }
# endif
#elif defined(CAFE)
// CAFE paired singles: 2x2 block transposes via __PS_MERGE, with an
// explicit swap of the off-diagonal pairs in between.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
  { \
    f32x2 tmp0, tmp1; \
    tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
    tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
    row0.vec.ps[0] = tmp0; \
    row1.vec.ps[0] = tmp1; \
    tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
    tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
    row2.vec.ps[1] = tmp0; \
    row3.vec.ps[1] = tmp1; \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
    row0.vec.ps[1] = row2.vec.ps[0]; \
    row1.vec.ps[1] = row3.vec.ps[0]; \
    row2.vec.ps[0] = tmp0; \
    row3.vec.ps[0] = tmp1; \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
    row0.vec.ps[1] = tmp0; \
    row1.vec.ps[1] = tmp1; \
  }
#endif
4290 
4292  float x;
4293  float y;
4294  float z;
4295 };
4296 
4298  float x;
4299  float y;
4300  float z;
4301  float w;
4302 };
4303 
// Plain 2x3 float matrix for memory I/O; no alignment requirement.
struct NLIB_VIS_PUBLIC Float2x3 {
  float m[2][3];
};
4307 
4309  float m[3][3];
4310 };
4311 
// Plain 3x4 float matrix for memory I/O; 16-byte aligned except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float3x4 {
#else
struct Float3x4 {
#endif
  float m[3][4];
};
4319 
// Plain 4x3 float matrix for memory I/O; 16-byte aligned except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x3 {
#else
struct Float4x3 {
#endif
  float m[4][3];
};
4327 
// Plain 4x4 float matrix for memory I/O; 16-byte aligned except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x4 {
#else
struct Float4x4 {
#endif
  float m[4][4];
};
4335 
4336 } // namespace simd
4337 NLIB_NAMESPACE_END
4338 
4339 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
float x
3次元ベクトルのx座標です。
Definition: SimdFloat.h:4292
SimdMatrix()
デフォルトコンストラクタです。
Definition: SimdFloat.h:4164
4x4行列を扱う関数が集められたクラスです。
Definition: SimdMatrix.h:28
#define NLIB_ALWAYS_INLINE
コンパイラに関数をインライン展開するように強く示します。
Definition: Platform_unix.h:95
視錐台を表すクラスです。
Definition: SimdGeometry.h:118
クォータニオンを扱う関数が集められたクラスです。
f128arg SimdVectorArg
f128argがtypedefされています。
Definition: SimdFloat.h:4150
128bitの単精度浮動小数点数用SIMDレジスタを2つ持つ型です。
Definition: SimdFloat.h:48
float x
4次元ベクトルのx座標です。
Definition: SimdFloat.h:4298
float y
4次元ベクトルのy座標です。
Definition: SimdFloat.h:4299
整数のSIMD演算を行うためのクラスや関数が実装されています。
SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) noexcept
引数から行列をセットアップします。
Definition: SimdFloat.h:4165
空の構造体で32bit単位に分けたレーンを選択することを示すためのタグです。
Definition: SimdInt.h:57
#define NLIB_VIS_HIDDEN
関数やクラス等のシンボルをライブラリの外部に公開しません。
Definition: Platform_unix.h:86
f128arg SimdSphereArg
f128argがtypedefされています。
Definition: SimdFloat.h:4156
#define NLIB_VIS_PUBLIC
関数やクラス等のシンボルをライブラリの外部に公開します。
Definition: Platform_unix.h:87
OBB(有向境界ボックス)を表すクラスです。中心座標(center)とxyz軸方向の大きさ(extent)及び回転クォータニ...
Definition: SimdGeometry.h:93
static f128 ShiftRight(f128arg a, f128arg b) noexcept
a を右にシフトして空いた部分にb の要素を順にシフトする形で設定します。
Definition: SimdFloat.h:346
空の構造体で単精度浮動小数点数を示すためのタグです。
Definition: SimdFloat.h:67
#define NLIB_ASSUME(cond)
cond が真であることを示してコンパイラに最適化のヒントを与えます。
Definition: Platform.h:593
包含関係の判定を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:281
3次元空間上の球を扱う静的メンバ関数が集められたクラスです。このクラスはインスタンス化できません。 ...
Definition: SimdGeometry.h:53
constexpr const each_float_tag each_float
each_float_tag型の定数オブジェクトで、単精度浮動小数点数を示すためのタグです。
Definition: SimdFloat.h:68
f128arg SimdQuaternionArg
f128argがtypedefされています。
Definition: SimdFloat.h:4152
nlib_i128_t i128
nlib_i128_tがtypedefされています。
Definition: SimdInt.h:76
3次元空間上の平面を扱う関数が集められたクラスです。
Definition: SimdGeometry.h:34
f128arg SimdPlaneArg
f128argがtypedefされています。
Definition: SimdFloat.h:4154
3次元ベクトルの計算を行う関数が集められたクラスです。全ての関数でレーン3に設定された値は無視されます...
Definition: SimdVector3.h:26
static f128 RotateLeft(f128arg value) noexcept
4個の単精度浮動小数点数を左にN 個分回転させます。
Definition: SimdFloat.h:333
距離(の2乗)の計算を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:147
4次元ベクトルをメモリから読み出したりメモリに書き出したりするための型です。float型のx, y, z, wをデータメンバとして保持します。
Definition: SimdFloat.h:4297
const f128 f128arg
const f128, 又はconst f128&がtypedefされています。
Definition: SimdFloat.h:78
4x4行列を保持する構造体です。
Definition: SimdFloat.h:4161
float z
4次元ベクトルのz座標です。
Definition: SimdFloat.h:4300
nlib_f128x2_t f128x2
nlib_f128x2_tがtypedefされています。
Definition: SimdFloat.h:73
f128 SimdSphere
f128がtypedefされています。球を扱う場合に利用されます。
Definition: SimdFloat.h:4155
128bitレジスタ(SSEではXMM0-XMM15, NEONではQ0-Q15)を用いて単精度浮動小数点数のSIMD演算を行うためのクラ...
Definition: SimdFloat.h:93
constexpr const each_uint32_tag each_uint32
each_uint32_tag型の定数オブジェクトで、32bitの符号なし整数を示すためのタグです。
Definition: SimdInt.h:53
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
Definition: Config.h:99
#define NLIB_CEXPR
利用可能であればconstexprが定義されます。そうでない場合は空文字列です。
Definition: Config.h:93
開発環境別の設定が書かれるファイルです。
4次元ベクトルの計算を行う関数が集められたクラスです。
Definition: SimdVector4.h:24
3次元ベクトルをメモリから読み出したりメモリに書き出したりするための型です。float型のx, y, zをデータメンバとして保持します。
Definition: SimdFloat.h:4291
#define NLIB_ALIGNAS(x)
alignas(x)又は同等の定義がされます。
Definition: Config.h:234
constexpr const each_int8_tag each_int8
each_int8_tag型の定数オブジェクトで、8bitの符号付き整数を示すためのタグです。
Definition: SimdInt.h:47
constexpr const each_select32_tag each_select32
each_select32_tag型の定数オブジェクトで、32bitのレーンを選択することを示すためのタグです。 ...
Definition: SimdInt.h:63
4x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x3の配列で16バイ...
Definition: SimdFloat.h:4323
3x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x3の配列です。 ...
Definition: SimdFloat.h:4308
空の構造体で32bitの符号なし整数を示すためのタグです。
Definition: SimdInt.h:43
static f128 RotateRight(f128arg value) noexcept
4個の単精度浮動小数点数を右にN 個分回転させます。
Definition: SimdFloat.h:340
float y
3次元ベクトルのy座標です。
Definition: SimdFloat.h:4293
nlib_f128_t f128
nlib_f128_tがtypedefされています。
Definition: SimdFloat.h:71
float z
3次元ベクトルのz座標です。
Definition: SimdFloat.h:4294
3次元空間におけるAABB(軸並行境界ボックス)を表すクラスです。最小座標(point_min)と最大座標(point_max)を...
Definition: SimdGeometry.h:74
constexpr const each_int32_tag each_int32
each_int32_tag型の定数オブジェクトで、32bitの符号付き整数を示すためのタグです。
Definition: SimdInt.h:49
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
Definition: Config.h:149
float w
4次元ベクトルのw座標です。
Definition: SimdFloat.h:4301
交差の判定を行う関数をまとめたクラスです。
Definition: SimdGeometry.h:191
f128 SimdQuaternion
f128がtypedefされています。クォータニオンを扱う場合に利用されます。
Definition: SimdFloat.h:4151
4x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x4の配列で16バイ...
Definition: SimdFloat.h:4331
3x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x4の配列で16バイ...
Definition: SimdFloat.h:4315
f128 SimdPlane
f128がtypedefされています。平面を扱う場合に利用されます。
Definition: SimdFloat.h:4153
__m128 nlib_f128_t
128bitの単精度浮動小数点数用SIMDレジスタのための型です。
Definition: SimdFloat.h:47
f128 SimdVector
f128がtypedefされています。3次元ベクトル又は4次元ベクトルを扱う場合に利用されます。 ...
Definition: SimdFloat.h:4149