nlib — SimdFloat.h (annotated source listing)
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
18 
19 #ifdef NN_PLATFORM_CTR
20 #ifndef __USE_C99_MATH
21 #define __USE_C99_MATH
22 #endif
23 #endif
24 #include <math.h>
25 #include <float.h>
26 
27 #include "nn/nlib/Config.h"
28 #include "nn/nlib/simd/SimdInt.h"
29 
30 #ifndef INFINITY
31 #define INFINITY ((float)(1e+300 * 1e+300))
32 #endif
33 
34 #if !defined(NLIB_SIMD) && !defined(CAFE)
35 #define NLIB_F128_SIMD_NOUSE
36 #endif
37 
#ifdef NLIB_F128_SIMD_NOUSE
/* Scalar fallback: 4 floats (or their raw bit patterns) in a plain struct. */
typedef struct {
    union {
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct {
    nlib_f128_t val[2];
} nlib_f128x2_t;
#elif defined(NLIB_SSE41)
typedef __m128 nlib_f128_t;
/* SSE has no native 2-register type; emulate NEON's float32x4x2_t. */
typedef struct {
    nlib_f128_t val[2];
} nlib_f128x2_t;
#elif defined(NLIB_NEON)
typedef float32x4_t nlib_f128_t;
typedef float32x4x2_t nlib_f128x2_t;
#elif defined(CAFE)
/* Cafe paired singles: two 2-float registers per 128-bit vector. */
typedef struct {
    union {
        f32x2 ps[2];
        float v[4];
        uint32_t u[4];
    } vec;
} nlib_f128_t;
typedef struct {
    nlib_f128_t val[2];
} nlib_f128x2_t;
#endif
68 
69 NLIB_NAMESPACE_BEGIN
70 namespace simd {
71 
72 // use each_float for the argument value
73 struct each_float_tag {};
75 
76 // __m128(SSE), float32x4_t(NEON)
77 typedef nlib_f128_t f128;
78 // float32x4x2_t(NEON)
80 
81 #if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
82 typedef const f128& f128arg;
83 #else
84 typedef const f128 f128arg;
85 #endif
86 
87 #if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
88 typedef const f128& f128arg_ex;
89 #else
90 typedef const f128 f128arg_ex;
91 #endif
92 
93 #if !defined(_MSC_VER) && !defined(__vectorcall)
94 #define __vectorcall
95 #endif
96 
98  public:
99  static f128 __vectorcall SetValue(float v, each_float_tag) NLIB_NOEXCEPT;
100  static f128 __vectorcall SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT;
101  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
102  template<size_t N>
103  static f128 __vectorcall SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT;
104  static f128 __vectorcall SetZero() NLIB_NOEXCEPT;
105  static f128 __vectorcall Set1000() NLIB_NOEXCEPT;
106  static f128 __vectorcall Set0100() NLIB_NOEXCEPT;
107  static f128 __vectorcall Set0010() NLIB_NOEXCEPT;
108  static f128 __vectorcall Set0001() NLIB_NOEXCEPT;
109  template<size_t N>
110  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
111  static f128 __vectorcall SetOne() NLIB_NOEXCEPT;
112  static f128 __vectorcall SetNegativeOne() NLIB_NOEXCEPT;
113  static f128 __vectorcall SetEpsilon() NLIB_NOEXCEPT;
114  static f128 __vectorcall SetInfinity() NLIB_NOEXCEPT;
115  static f128 __vectorcall SetNaN() NLIB_NOEXCEPT;
116  static f128 __vectorcall SetSignMask() NLIB_NOEXCEPT;
117 
118  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
119  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
120  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
121  static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
122  static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
123  static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
124  static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
125  static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
126  static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;
127 
128  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
129  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
130  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
131  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
132  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
133  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
134  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
135  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
136  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
137 
138  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
139  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
140  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
141  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
142  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
143  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
144 
145  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
146  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
147  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
148  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
149  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
150  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
151 
152  /*
153  static f128 __vectorcall ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT;
154  static f128 __vectorcall ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT;
155  static f128 __vectorcall ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT;
156  */
157 
158 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
159  static f128 __vectorcall ConvertFromI128(i128 value) NLIB_NOEXCEPT;
160  static i128 __vectorcall ConvertToI128Round(f128 value) NLIB_NOEXCEPT;
161  static i128 __vectorcall ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT;
162 
163  static f128 __vectorcall CastFromI128(i128 value) NLIB_NOEXCEPT;
164  static i128 __vectorcall CastToI128(f128 value) NLIB_NOEXCEPT;
165 
166  template<int N>
167  static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
168  template<int N>
169  static i128 __vectorcall ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT;
170 #endif
171 
172  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
173  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
174  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
175  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
176  template<size_t N>
177  static f128 __vectorcall Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT;
178  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
179  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
180  template<bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
181  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
182  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
183  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
184  template<size_t N>
185  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
187  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
188  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
189  template<size_t N>
190  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
192  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
193  static f128 __vectorcall Abs(f128arg value) NLIB_NOEXCEPT;
194  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
195 
196  //
197  // Min/Max
198  //
199 
200  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
201  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
202  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
203  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
204  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
205  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
206 
207  //
208  // Reciprocal/Sqrt
209  //
210 
211  static f128 __vectorcall Recp(f128arg value) NLIB_NOEXCEPT;
212  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
213  static f128 __vectorcall Sqrt(f128arg value) NLIB_NOEXCEPT;
214  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
215  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
216  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
217 
218  //
219  // Round/Truncate
220  //
221 
222  static f128 __vectorcall Round(f128arg value) NLIB_NOEXCEPT;
223  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
224  static f128 __vectorcall Floor(f128arg value) NLIB_NOEXCEPT;
225  static f128 __vectorcall Ceil(f128arg value) NLIB_NOEXCEPT;
226 
227  //
228  // Logical Operations
229  //
230 
231  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
232  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
233  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
234  static f128 __vectorcall Not(f128arg a) NLIB_NOEXCEPT;
235  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
236  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
237 
238  //
239  // Comparison Operations
240  //
241 
242  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
243  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
244  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
245  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
246  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
247  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
248  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
249  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
250 
251  static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
252  static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
253  static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
254  static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
255  static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
256  static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
257  static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;
258 
259  //
260  // Trigonometric function
261  //
262  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
263  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
264  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
265  static f128 __vectorcall Sin(f128arg value) NLIB_NOEXCEPT;
266  static f128 __vectorcall Cos(f128arg value) NLIB_NOEXCEPT;
267  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
268  static f128 __vectorcall Tan(f128arg value) NLIB_NOEXCEPT;
269  static f128 __vectorcall SinH(f128arg value) NLIB_NOEXCEPT;
270  static f128 __vectorcall CosH(f128arg value) NLIB_NOEXCEPT;
271  static f128 __vectorcall TanH(f128arg value) NLIB_NOEXCEPT;
272  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
273  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
274  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
275  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
276  // and Est version needed?
277 
278  //
279  // Interpolation
280  //
281 
282  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
283  static f128 __vectorcall Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
284  f128arg_ex t) NLIB_NOEXCEPT;
285  static f128 __vectorcall CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
286  f128arg_ex t) NLIB_NOEXCEPT;
287  static f128 __vectorcall BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
288  f128arg_ex g) NLIB_NOEXCEPT;
289 
290  //
291  // Exp/Log
292  //
293  static f128 __vectorcall Exp2(f128arg value) NLIB_NOEXCEPT;
294  static f128 __vectorcall ExpE(f128arg value) NLIB_NOEXCEPT;
295  static f128 __vectorcall Log2(f128arg value) NLIB_NOEXCEPT;
296  static f128 __vectorcall LogE(f128arg value) NLIB_NOEXCEPT; // not implemented
297 
298  //
299  // Misc
300  //
301 
302  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
303  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
304  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
305  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
306  static f128 __vectorcall IsNaN(f128arg value) NLIB_NOEXCEPT;
307  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
308 
309  //
310  // Get/Set
311  //
312 
313  template<size_t N>
314  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
315  template<size_t N>
316  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
317  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
318  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
319 
320  template<size_t N>
321  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
322  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
323 
324  //
325  // Swizzle/Permute
326  //
327 
328  template<int V0, int V1, int V2, int V3>
329  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
330  template<int V0, int V1, int V2, int V3>
331  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
332  template<bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
333  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
334 
335  template<size_t N>
336  // left <- value[3], value[2], value[1], value[0] -> right
337  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
338  NLIB_STATIC_ASSERT(N < 4);
339  const size_t NN = 4 - N;
340  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
341  }
342  template<size_t N>
343  // left <- value[3], value[2], value[1], value[0] -> right
344  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
345  NLIB_STATIC_ASSERT(N < 4);
346  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
347  }
348  template<size_t N>
349  // left <- b[3], ..., b[0], a[3], ..., a[0] -> right
350  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
351  NLIB_STATIC_ASSERT(N < 4);
352  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
353  }
354 
355  private:
356  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v1000_[4]; // 1.f, 0.f, 0.f, 0.f
357  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0100_[4]; // 0.f, 1.f, 0.f, 0.f
358  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0010_[4]; // 0.f, 0.f, 1.f, 0.f
359  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float v0001_[4]; // 0.f, 0.f, 0.f, 1.f
360 
361  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float pi_values_[4]; // pi, -pi, 2pi, pi/2
362  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const uint32_t nan_inf_[4]; // nan, inf, -nan, -inf
363 
364  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R0_[4];
365  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float hermite_R1_[4];
366 
367  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R0_[4];
368  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R1_[4];
369  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float catmull_R2_[4];
370 
371  // Those coefficients are from Cephes Math Library
372  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_cvalue_[4];
373  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float sin_coeff_[4];
374  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_cvalue_[4];
375  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float cos_coeff_[4];
376  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan_coeff_[8];
377  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float atan2_cvalue_[4];
378 
379  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_P_[4];
380  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float exp2_Q_[4];
381  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tanh_cvalue_[4];
382  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_p_[4];
383  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_q_[4];
384  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float tan_c_[4];
385  NLIB_ALIGNAS(16) static NLIB_VIS_PUBLIC const float log2_PQ_[12];
386 
387  F128(); // forbidden
388  friend class Vector3;
389  friend class Vector4;
390  friend class Matrix;
391  friend class Plane;
392  friend class Quaternion;
393  friend class Sphere;
394  friend class AxisAlignedBox;
395  friend class OrientedBox;
396  friend class Frustum;
397 
398  friend class DistanceSq;
399  friend class Intersection;
400  friend class Containment;
401 };
402 
403 #ifndef NLIB_DOXYGEN
404 
405 #undef NLIB_M
406 #define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
407 #define NLIB_M2(tp) inline tp __vectorcall
408 
409 // r[i] = v
// r[i] = v : broadcast a float to all four lanes
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback: assign each lane by hand.
    f128 ret;
    ret.vec.v[0] = v;
    ret.vec.v[1] = v;
    ret.vec.v[2] = v;
    ret.vec.v[3] = v;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
    return vdupq_n_f32(v);
#elif defined(CAFE)
    // Paired singles: duplicate the value into both 2-float halves.
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
    return ret;
#endif
}
428 
429 // r[i] = *reinterpret_cast<float*>(&v)
// r[i] = *reinterpret_cast<float*>(&v) : broadcast a raw 32-bit pattern,
// used to build masks and special values (inf/NaN) as floats.
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.u[0] = v;
    ret.vec.u[1] = v;
    ret.vec.u[2] = v;
    ret.vec.u[3] = v;
    return ret;
#elif defined(NLIB_SSE41)
    // Type-pun through a union so the bit pattern is carried verbatim.
    union {
        float f32;
        uint32_t u32;
    } tmp;
    tmp.u32 = v;
    return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
    // NEON has a direct integer broadcast + reinterpret, no punning needed.
    uint32x4_t tmp = vdupq_n_u32(v);
    return vreinterpretq_f32_u32(tmp);
#elif defined(CAFE)
    union {
        float f32;
        uint32_t u32;
    } tmp;
    tmp.u32 = v;
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
    return ret;
#endif
}
459 
460 // r[0] = a, r[1] = b, r[2] = c, r[3] = d
// r[0] = a, r[1] = b, r[2] = c, r[3] = d
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a;
    ret.vec.v[1] = b;
    ret.vec.v[2] = c;
    ret.vec.v[3] = d;
    return ret;
#elif defined(NLIB_SSE41)
    // _mm_set_ps takes its arguments high-lane-first, hence the reversal.
    return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
    // Pack each float pair into a 64-bit pattern and build the two halves
    // with vcreate_f32 (lane 0 occupies the low 32 bits of the uint64).
    union {
        float f32[2];
        uint64_t u64;
    } tmp1, tmp2;
    tmp1.f32[0] = a;
    tmp1.f32[1] = b;
    tmp2.f32[0] = c;
    tmp2.f32[1] = d;
    return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0][0] = a;
    ret.vec.ps[0][1] = b;
    ret.vec.ps[1][0] = c;
    ret.vec.ps[1][1] = d;
    return ret;
#endif
}
490 
template<size_t N>
// r[i] = value[N] : broadcast lane N (N in [0,3]) to all lanes
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = value.vec.v[N];
    ret.vec.v[1] = value.vec.v[N];
    ret.vec.v[2] = value.vec.v[N];
    ret.vec.v[3] = value.vec.v[N];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
    // vdupq_lane_f32 only addresses a 2-lane vector, so this primary
    // template handles N = 0/1; N = 2/3 are covered by the explicit
    // specializations below that take the high half instead.
    float32x2_t tmp = vget_low_f32(value);
    return vdupq_lane_f32(tmp, N);
#elif defined(CAFE)
    f128 ret;
    // N/2 picks the paired-single half, N%2 the slot within it.
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
    return ret;
#endif
}
513 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// NEON specializations for lanes 2/3: take the high 2-lane half first,
// because vdupq_lane_f32 can only index lanes 0-1 of a float32x2_t.
template<>
NLIB_M(f128)
F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 0);
}
template<>
NLIB_M(f128)
F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE specializations: __PS_MERGE00/__PS_MERGE11 duplicate slot 0 or
// slot 1 of the selected paired-single half into both slots.
template<>
NLIB_M(f128)
F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template<>
NLIB_M(f128)
F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template<>
NLIB_M(f128)
F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
template<>
NLIB_M(f128)
F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
#endif
557 
// r[i] = 0.f
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_setzero_ps();
#elif defined(NLIB_NEON)
    return vdupq_n_f32(0);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
    return ret;
#endif
}
577 
// r = { 1.f, 0.f, 0.f, 0.f } (e.g. X basis vector)
NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 1.f;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // 0x3F800000 is the bit pattern of 1.0f, placed in the low 32 bits
    // (lane 0); built from immediates to avoid a memory load.
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x10, x00);
#else
    // SSE/CAFE: load the prebuilt 16-byte-aligned constant table.
    return F128::LoadA16(F128::v1000_);
#endif
}
594 
// r = { 0.f, 1.f, 0.f, 0.f } (e.g. Y basis vector)
NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 1.f;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // 1.0f bit pattern in the high 32 bits of the pair -> lane 1.
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x01, x00);
#else
    return F128::LoadA16(F128::v0100_);
#endif
}
611 
// r = { 0.f, 0.f, 1.f, 0.f } (e.g. Z basis vector)
NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 1.f;
    ret.vec.v[3] = 0;
    return ret;
#elif defined(NLIB_NEON)
    // Same {1,0} pair as Set1000, but combined into the HIGH half -> lane 2.
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x10);
#else
    return F128::LoadA16(F128::v0010_);
#endif
}
628 
// r = { 0.f, 0.f, 0.f, 1.f } (e.g. W basis vector)
NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = 0;
    ret.vec.v[1] = 0;
    ret.vec.v[2] = 0;
    ret.vec.v[3] = 1.f;
    return ret;
#elif defined(NLIB_NEON)
    // {0,1} pair combined into the high half -> lane 3.
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x01);
#else
    return F128::LoadA16(F128::v0001_);
#endif
}
645 
template<size_t N>
// r = value, except r[N] = 0.f
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[N] = 0.f;
    return ret;
#elif defined(NLIB_SSE41)
    // insertps imm8 bits 0-3 are a zero mask; 1 << N zeroes lane N only.
    return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
    // Permute indices 4-7 select from the second operand (all zeros), so
    // lane N is replaced by zero while the others pass through.
    return F128::Permute < N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
           N == 3 ? 7 : 3 > (value, vdupq_n_f32(0.f));
    // return vsetq_lane_f32(0.f, value, N);
#elif defined(CAFE)
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = 0.f;
    return ret;
#endif
}
666 
// Convenience broadcasts of common constants, all delegating to SetValue.

// r[i] = 1.f
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
    return F128::SetValue(1.f, each_float);
}

// r[i] = -1.f
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
    return F128::SetValue(-1.f, each_float);
}

// r[i] = 1.0e-7f (library tolerance constant, not FLT_EPSILON)
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
    return F128::SetValue(1.0e-7f, each_float);
}

// r[i] = 0x7F800000U (+infinity bit pattern)
NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
    return F128::SetValue(0x7F800000U, each_uint32);
}

// r[i] = 0x7FC00000U (quiet NaN bit pattern)
NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
    return F128::SetValue(0x7FC00000U, each_uint32);
}

// r[i] = -0.f (0x80000000U), i.e. only the sign bit set in each lane
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
    return F128::SetValue(-0.f, each_float);
}
696 
// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = p[0];
    ret.vec.v[1] = p[1];
    ret.vec.v[2] = p[2];
    ret.vec.v[3] = p[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_load_ps(p);
#elif defined(NLIB_NEON)
    // Load as 64-bit elements, then reinterpret; presumably this conveys the
    // wider alignment guarantee to the compiler -- confirm against codegen.
    const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
    uint64x2_t val = vld1q_u64(tmp);
    return vreinterpretq_f32_u64(val);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0][0] = p[0];
    ret.vec.ps[0][1] = p[1];
    ret.vec.ps[1][0] = p[2];
    ret.vec.ps[1][1] = p[3];
    return ret;
#endif
}
721 
// r[i] = p[i], p is 4 bytes aligned (the weakest alignment variant)
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar path has no alignment requirement; reuse the A16 code.
    return LoadA16(p);
#elif defined(NLIB_SSE41)
    return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
    return vld1q_f32(p);
#elif defined(CAFE)
    f128 ret;
    ret.vec.ps[0][0] = p[0];
    ret.vec.ps[0][1] = p[1];
    ret.vec.ps[1][0] = p[2];
    ret.vec.ps[1][1] = p[3];
    return ret;
#endif
}
739 
// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON: load through 64-bit elements; presumably to express the 8-byte
    // alignment guarantee -- confirm against codegen.
    const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
    uint64x2_t val = vld1q_u64(tmp);
    return vreinterpretq_f32_u64(val);
#else
    // Other platforms have no dedicated 8-byte-aligned load; fall back.
    return LoadA4(p);
#endif
}
750 
// Integer-address overloads: forward to the const float* loads above.
// The A16/A8/A4 suffix is the address's alignment contract in bytes.

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 16 bytes aligned
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 8 bytes aligned
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}

// r[i] = p[i], p is 4 bytes aligned
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}
780 
// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
    p[2] = value.vec.v[2];
    p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
    // Store through 64-bit elements (mirrors LoadA16's NEON path).
    uint64x2_t tmp = vreinterpretq_u64_f32(value);
    vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
    p[2] = value.vec.ps[1][0];
    p[3] = value.vec.ps[1][1];
#endif
}
800 
// p[i] = value[i], p is 4 bytes aligned (the weakest alignment variant)
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    StoreA16(p, value);
#elif defined(NLIB_SSE41)
    _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
    vst1q_f32(p, value);
#elif defined(CAFE)
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
    p[2] = value.vec.ps[1][0];
    p[3] = value.vec.ps[1][1];
#endif
}
816 
// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // NEON: 64-bit element store mirrors LoadA8's alignment-hint trick.
    uint64x2_t tmp = vreinterpretq_u64_f32(value);
    vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
    // Other platforms have no dedicated 8-byte-aligned store; fall back.
    StoreA4(p, value);
#endif
}
826 
// Integer-address overloads: forward to the float* stores above.

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 16 bytes aligned
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 8 bytes aligned
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}

// p[i] = value[i], p is 4 bytes aligned
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}
856 
// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned (low half only)
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
    _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    // 64-bit element store of the low half (alignment-hint variant).
    uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
    vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(CAFE)
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
#endif
}
872 
// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned (low half only)
NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
    _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    // Plain float store: only 4-byte alignment is guaranteed here.
    float32x2_t tmp = vget_low_f32(value);
    vst1_f32(p, tmp);
#elif defined(CAFE)
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
#endif
}
888 
// Integer-address overloads for the low-half stores.

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 8 bytes aligned
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[0], p[1] = value[1], p is 4 bytes aligned
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}
908 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
// Stores only the high two lanes; p[2..] is left untouched.
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 p[0] = value.vec.v[2];
 p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
 _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
 vst1_f32(p, vget_high_f32(value));
#elif defined(CAFE)
 p[0] = value.vec.ps[1][0];
 p[1] = value.vec.ps[1][1];
#endif
}
923 
// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
// Same as StoreHiA8 but only assumes 4-byte alignment.
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 p[0] = value.vec.v[2];
 p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
 _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
 float32x2_t tmp = vget_high_f32(value);
 vst1_f32(p, tmp);
#elif defined(CAFE)
 p[0] = value.vec.ps[1][0];
 p[1] = value.vec.ps[1][1];
#endif
}
939 
// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
// Integer-address overloads: forward to the float* versions after a cast.
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
 StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
 StoreHiA4(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 8 bytes aligned
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
 StoreHiA8(reinterpret_cast<float*>(p), value);
}

// p[0] = value[2], p[1] = value[3], p is 4 bytes aligned
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
 StoreHiA4(reinterpret_cast<float*>(p), value);
}
959 
// r[i] = fabs(value[i])
// SSE has no abs intrinsic, so the sign bit is cleared bitwise instead.
// NOTE(review): the scalar fallback uses '> 0', so -0.0f passes through as -0.0f
// there, while the bit-masking backends yield +0.0f — confirm callers don't care.
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
 ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
 ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
 ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
 return ret;
#elif defined(NLIB_NEON)
 return vabsq_f32(value);
#elif defined(NLIB_SSE41)
 const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
 return _mm_andnot_ps(signmask, value);
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
 ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
 return ret;
#endif
}
981 
// r[i] = mask[i] ? a[i] : b[i]
// NOTE(review): the scalar path blends bit-by-bit, while _mm_blendv_ps keys off
// each lane's sign bit only — identical when mask lanes are all-0s/all-1s (the
// form produced by the Cmp* functions), which is the intended usage.
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 result;
 result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
 result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
 result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
 result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
 return result;
#elif defined(NLIB_SSE41)
 return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
 return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(CAFE)
 // avoid NaN
 // __PS_SEL selects by the float sign of the mask; clearing one mantissa bit
 // keeps an all-ones mask from being treated as NaN by the FP compare.
 f128 mask_ = mask;
 mask_.vec.u[0] &= 0xFF7FFFFFUL;
 mask_.vec.u[1] &= 0xFF7FFFFFUL;
 mask_.vec.u[2] &= 0xFF7FFFFFUL;
 mask_.vec.u[3] &= 0xFF7FFFFFUL;
 // mask_ < 0 ? a : b
 f128 ret;
 ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
 ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
 return ret;
#endif
}
1009 
1010 /*
1011 NLIB_M(f128) F128::ConvertFromFp16Lo(f128arg value) NLIB_NOEXCEPT {
1012 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1013  float16x4_t lo = vget_low_f16(vreinterpretq_f16_f32(value));
1014  return vcvt_f32_f16(lo);
1015 #else
1016  // _mm_cvtph_ps
1017  (void)value;
1018  return F128::SetZero();
1019 #endif
1020 }
1021 
1022 NLIB_M(f128) F128::ConvertFromFp16High(f128arg value) NLIB_NOEXCEPT {
1023 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1024 #ifdef __aarch64__
1025  return vcvt_high_f32_f16(value);
1026 #else
1027  float16x4_t hi = vget_high_f16(vreinterpretq_f16_f32(value));
1028  return vcvt_f32_f16(hi);
1029 #endif
1030 #else
1031  // _mm_cvtph_ps
1032  (void)value;
1033  return F128::SetZero();
1034 #endif
1035 }
1036 
1037 NLIB_M(f128) F128::ConvertToFp16(f128arg a, f128arg b) NLIB_NOEXCEPT {
1038 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1039 #ifdef __aarch64__
1040  float16x4_t lo = vcvt_f16_f32(a);
1041  float16x8_t x = vcvt_high_f16_f32(lo, b);
1042  return vreinterpretq_f32_f16(x);
1043 #else
1044  float16x4_t lo = vcvt_f16_f32(a);
1045  float16x4_t hi = vcvt_f16_f32(b);
1046  return vreinterpretq_f32_f16(vcombine_f16(lo, hi));
1047 #endif
1048 #else
1049  // _mm_cvtps_ph
1050  (void)a;
1051  (void)b;
1052  return F128::SetZero();
1053 #endif
1054 }
1055 */
1056 
1057 #if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// r[i] = static_cast<float>(value[i])
// Converts four signed 32-bit lanes to float (i128 is reinterpreted s8->s32 on NEON).
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
 return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
 return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}
1066 
// r[i] = *reinterpret_cast<float*>(&value[i])
// Pure bit reinterpretation — no value conversion, no code generated.
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
 return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
 return vreinterpretq_f32_s8(value);
#endif
}
1075 
// r[i] = static_cast<int>(roundf(value[i]))
// NOTE(review): the NEON path adds a sign-copied 0.5 then truncates (ties round
// away from zero), while _mm_cvtps_epi32 rounds per MXCSR (ties-to-even by
// default) — results can differ on exact .5 inputs across platforms.
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
 return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
 uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
 uint32x4_t sgn = vdupq_n_u32(0x80000000U);
 // w = 0.5f with the sign of value[i], so the add biases toward the nearest int.
 uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
 w = vorrq_u32(w, half);
 return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}
1088 
// r[i] = static_cast<int>(value[i]) -- truncates toward zero on both backends
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
 return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
 return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}
1096 
// r[i] = *reinterpret_cast<int*>(&value[i])
// Pure bit reinterpretation — no value conversion, no code generated.
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
 return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
 return vreinterpretq_s8_f32(value);
#endif
}
1105 
template<int N>
// Converts signed fixed-point lanes with N fractional bits to float:
// r[i] = value[i] * 2^-N. Non-NEON path builds the float constant 2^-N
// directly from its exponent bits and multiplies.
NLIB_M(f128)
F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
 return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
 f128 f = F128::ConvertFromI128(value);
 f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32); // 0.5f^N
 return F128::Mult(f, m);
#endif
}
1118 
template<int N>
// Converts float lanes to signed fixed-point with N fractional bits:
// r[i] = trunc(value[i] * 2^N). Non-NEON path builds the float constant 2^N
// from its exponent bits, scales, then truncates.
NLIB_M(i128)
F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(1 <= N && N <= 32);
#if defined(NLIB_NEON)
 return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
 f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32); // 2.f^N
 f128 f = F128::Mult(value, m);
 return F128::ConvertToI128Truncate(f);
#endif
}
1131 
1132 #endif
1133 
// r[i] = (a[i] < b[i]) ? 0xFFFFFFFF : 0
// Produces a full-lane mask suitable for Select/And; NaN lanes compare false.
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vcltq_f32(a, b);
 return vreinterpretq_f32_u32(tmp);
#endif
}
1150 
// r[i] = (a[i] <= b[i]) ? 0xFFFFFFFF : 0
// Produces a full-lane mask suitable for Select/And; NaN lanes compare false.
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vcleq_f32(a, b);
 return vreinterpretq_f32_u32(tmp);
#endif
}
1167 
// r[i] = (a[i] > b[i]) ? 0xFFFFFFFF : 0
// Produces a full-lane mask suitable for Select/And; NaN lanes compare false.
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vcgtq_f32(a, b);
 return vreinterpretq_f32_u32(tmp);
#endif
}
1184 
// r[i] = (a[i] >= b[i]) ? 0xFFFFFFFF : 0
// Produces a full-lane mask suitable for Select/And; NaN lanes compare false.
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vcgeq_f32(a, b);
 return vreinterpretq_f32_u32(tmp);
#endif
}
1201 
// r[i] = (a[i] != b[i]) ? 0xFFFFFFFF : 0
// Unlike the ordered compares above, NaN lanes compare as not-equal (mask set).
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
 ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
 // NEON has no "not equal" compare; negate the equality mask instead.
 uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
 return vreinterpretq_f32_u32(tmp);
#endif
}
1218 
// r[i] = a[i] + b[i]
// Lane-wise single-precision add; one implementation per backend.
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
 return vaddq_f32(a, b);
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
 ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
 return ret;
#endif
}
1239 
// r[i] = a[i] - b[i]
// Lane-wise single-precision subtract; one implementation per backend.
NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
 return vsubq_f32(a, b);
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
 ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
 return ret;
#endif
}
1260 
// r[i] = -value[i]
// SSE has no negate intrinsic, so the sign bit is flipped with an XOR.
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = -value.vec.v[0];
 ret.vec.v[1] = -value.vec.v[1];
 ret.vec.v[2] = -value.vec.v[2];
 ret.vec.v[3] = -value.vec.v[3];
 return ret;
#elif defined(NLIB_NEON)
 return vnegq_f32(value);
#elif defined(NLIB_SSE41)
 const __m128 signmask = _mm_set1_ps(-0.0f); // 0x80000000
 return _mm_xor_ps(signmask, value);
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
 ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
 return ret;
#endif
}
1282 
// r[i] = a[i] * b[i]
// Lane-wise single-precision multiply; one implementation per backend.
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
 return vmulq_f32(a, b);
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
 ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
 return ret;
#endif
}
1303 
// r[i] = a * b[i]
// Scalar-vector multiply; NEON/CAFE have dedicated scalar forms, other
// backends splat the scalar and fall back to the vector overload.
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
 return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
 f128 ret;
 ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
 ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
 return ret;
#else
 return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}
1317 
1318 template<size_t N>
1319 // r[i] = a[N] * b[i]
1320 NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
1321 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1322 #if __aarch64__
1323  return vmulq_laneq_f32(b, a, N);
1324 #else
1325  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
1326  return vmulq_n_f32(b, tmp);
1327 #endif
1328 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
1329  float t = a.vec.ps[N / 2][N % 2];
1330  f128 ret;
1331  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
1332  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
1333  return ret;
1334 #else
1335  return F128::Mult(F128::SetValue<N>(a, each_select32), b);
1336 #endif
1337 }
1338 
// r[i] = a[i] / b[i]
// A32 NEON has no divide: uses a reciprocal estimate plus two Newton-Raphson
// refinement steps, with b == 0 lanes forced to infinity so a/0 behaves like
// the other backends. NOTE(review): that zero check compares b == +/-0 and
// substitutes SetInfinity() regardless of sign — confirm -0 divisors and the
// sign of a don't matter to callers.
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
 return vdivq_f32(a, b);
#else
 float32x4_t inv0 = vrecpeq_f32(b);
 float32x4_t step0 = vrecpsq_f32(inv0, b);
 float32x4_t inv1 = vmulq_f32(step0, inv0);
 float32x4_t step1 = vrecpsq_f32(inv1, b);
 float32x4_t inv2 = vmulq_f32(step1, inv1);
 uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
 inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
 return vmulq_f32(a, inv2);
#endif
#elif defined(CAFE)
 f128 ret;
 ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
 ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
 return ret;
#endif
}
1370 
// r[i] = max(a[i], b[i])
// CAFE emulates max via sign-select on (a - b); NOTE(review): NaN propagation
// differs between backends (e.g. _mm_max_ps returns the second operand when
// either input is NaN) — do not rely on NaN behavior here.
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
 return vmaxq_f32(a, b);
#elif defined(CAFE)
 f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
 f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
 f128 ret;
 ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
 ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
 return ret;
#endif
}
1393 
// r[i] = min(a[i], b[i])
// Mirror image of Max; the CAFE path swaps the select operands on (a - b).
// NOTE(review): NaN propagation differs between backends — do not rely on it.
NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
 ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
 ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
 ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
 return vminq_f32(a, b);
#elif defined(CAFE)
 f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
 f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
 f128 ret;
 ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
 ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
 return ret;
#endif
}
1416 
// r[0] = max(a[0], a[1]), r[1] = max(a[2], a[3]), r[2] = max(b[0], b[1]), r[3] = max(b[2], b[3])
// Horizontal (pairwise) max across adjacent lanes of each input.
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
 ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
 ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
 ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 // Compare each vector against its lane-swapped self, then pack the even lanes.
 f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
 f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
 return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
 return vpmaxq_f32(a, b);
#else
 float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
 float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
 return vcombine_f32(rl, rh);
#endif
#elif defined(CAFE)
 // Split lanes into even/odd halves and sign-select on their difference.
 f32x2 v02, v13, cmp;
 f128 ret;
 v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
 v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
 cmp = __PS_SUB(v02, v13);
 ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
 v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
 v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
 cmp = __PS_SUB(v02, v13);
 ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
 return ret;
#endif
}
1452 
// r[0] = min(a[0], a[1]), r[1] = min(a[2], a[3]), r[2] = min(b[0], b[1]), r[3] = min(b[2], b[3])
// Horizontal (pairwise) min across adjacent lanes of each input.
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
 ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
 ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
 ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 // Compare each vector against its lane-swapped self, then pack the even lanes.
 f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
 f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
 return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
 return vpminq_f32(a, b);
#else
 float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
 float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
 return vcombine_f32(rl, rh);
#endif
#elif defined(CAFE)
 // Split lanes into even/odd halves and sign-select on their difference.
 f32x2 v02, v13, cmp;
 f128 ret;
 v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
 v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
 cmp = __PS_SUB(v02, v13);
 ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
 v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
 v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
 cmp = __PS_SUB(v02, v13);
 ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
 return ret;
#endif
}
1488 
// r[0] = a[0] + a[1], r[1] = a[2] + a[3], ...
// Horizontal (pairwise) add: adjacent lane sums of a fill the low half,
// adjacent lane sums of b fill the high half.
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
 f128 ret;
 ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
 ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
 ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
 ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
 return vpaddq_f32(a, b);
#else
 float32x2_t al = vget_low_f32(a);
 float32x2_t ah = vget_high_f32(a);
 float32x2_t l = vpadd_f32(al, ah);

 float32x2_t bl = vget_low_f32(b);
 float32x2_t bh = vget_high_f32(b);
 float32x2_t h = vpadd_f32(bl, bh);
 return vcombine_f32(l, h);
#endif
#elif defined(CAFE)
 // Split lanes into even/odd halves and add them.
 f32x2 v02, v13, cmp;
 f128 ret;
 v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
 v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
 ret.vec.ps[0] = __PS_ADD(v02, v13);
 v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
 v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
 ret.vec.ps[1] = __PS_ADD(v02, v13);
 return ret;
#endif
}
1525 
// r[i] = fabs(a[i] - b[i])
// NEON has a fused absolute-difference instruction; others compose Abs(Sub()).
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
 return vabdq_f32(a, b);
#else
 return F128::Abs(F128::Sub(a, b));
#endif
}
1534 
// r[i] = c[i] + a[i] * b[i]
// NOTE(review): A64 and CAFE use fused multiply-add (single rounding); the
// fallback rounds twice — last-ulp results may differ between platforms.
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#if __aarch64__
 return vfmaq_f32(c, a, b);
#else
 return vmlaq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
 f128 ret;
 ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
 ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
 return ret;
#else
 return F128::Add(c, F128::Mult(a, b));
#endif
}
1552 
// r[i] = c[i] + a * b[i]
// Scalar-coefficient variant; non-NEON backends splat a and reuse the vector form.
NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#if __aarch64__
 return vfmaq_n_f32(c, b, a);
#else
 return vmlaq_n_f32(c, b, a);
#endif
#else
 return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}
1565 
template<size_t N>
// r[i] = c[i] + a[N] * b[i]
// Lane-coefficient variant: every lane of b is scaled by lane N of a.
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT {
 NLIB_STATIC_ASSERT(N < 4);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#if __aarch64__
 return vfmaq_laneq_f32(c, b, a, N);
#else
 // A32 NEON only takes a 64-bit lane source; pick the half holding lane N.
 return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
 return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
1580 
// r[i] = c[i] - a[i] * b[i]
// NOTE(review): A64 and CAFE use fused multiply-subtract (single rounding);
// the fallback rounds twice — last-ulp results may differ between platforms.
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#if __aarch64__
 return vfmsq_f32(c, a, b);
#else
 return vmlsq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
 f128 ret;
 ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
 ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
 return ret;
#else
 return F128::Sub(c, F128::Mult(a, b));
#endif
}
1598 
// r[i] = c[i] - a * b[i]
// Scalar-coefficient variant; non-NEON backends splat a and reuse the vector form.
NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#if __aarch64__
 return vfmsq_n_f32(c, b, a);
#else
 return vmlsq_n_f32(c, b, a);
#endif
#else
 return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}
1611 
1612 template<size_t N>
1613 // r[i] = c[i] - a[N] * b[i]
1614 NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT {
1615  NLIB_STATIC_ASSERT(N < 4);
1616 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1617 #ifdef __arch64__
1618  return vfmsq_laneq_f32(c, b, a, N);
1619 #else
1620  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
1621 #endif
1622 #else
1623  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
1624 #endif
1625 }
1626 
// r[i] = a[i] + t[i] * (b[i] - a[i])
// Lane-wise linear interpolation; t outside [0, 1] extrapolates.
NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
 // a + t * (b - a)
 return F128::MultAdd(t, F128::Sub(b, a), a);
}
1632 
// r[i] = a[i] & b[i]
// Bitwise AND of the raw lane bits (used to apply compare masks to values).
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
 ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
 ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
 ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
 return vreinterpretq_f32_u32(tmp);
#endif
}
1649 
1650 // -pi <= angle1, angle2, r < pi
1651 NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
1652  // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
1653  // -pi <= ret < pi
1654  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
1655  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);
1656 
1657  f128 sum = F128::Add(angle1, angle2);
1658  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
1659  f128 ofs = F128::And(cond, pi_dbl);
1660  f128 result = F128::Add(sum, ofs);
1661  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
1662  ofs = F128::And(cond, pi_dbl);
1663  return F128::Sub(result, ofs);
1664 }
1665 
1666 // -pi <= angle1, angle2, r < pi
1667 NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
1668  // -pi <= angle1 < pi, -2pi <= angle2 <= 2pi
1669  // -pi <= ret < pi
1670  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
1671  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);
1672 
1673  f128 sum = F128::Sub(angle1, angle2);
1674  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
1675  f128 ofs = F128::And(cond, pi_dbl);
1676  f128 result = F128::Add(sum, ofs);
1677  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
1678  ofs = F128::And(cond, pi_dbl);
1679  return F128::Sub(result, ofs);
1680 }
1681 
// ( 2 1 -2 1) (p0)
// (t^3 t^2 t 1) (-3 -2 3 -1) (v0)
// ( 0 1 0 0) (p1)
// ( 1 0 0 0) (v1)
// Cubic Hermite interpolation between p1 at t=0 and... at t=1 the weight
// vector evaluates to {0, 0, 1, 0}, i.e. the curve runs from p0 (t=0) to
// p1 (t=1) with tangents v0 and v1.
NLIB_M2(f128)
F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT {
 // (2 * p0 + v0 - 2 * p1 + v1) * t^3 + (-3 * p0 - 2 * v0 + 3 * p1 - v1) * t^2
 // + v0 * t + p0
 // ==
 // (2 * t^3 - 3 * t^2 + 1) * p0 + (t^3 - 2 * t^2 + t) * v0
 // + (-2 * t^3 + 3 * t^2) * p1 + (t^3 - t^2) * v1
 f128 tt = F128::Mult(t, t);
 f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
 // Bit patterns encode the matrix rows directly (vcreate fills low lane first):
 // hermite_R0 = {2, 1, -2, 1} (t^3 row), hermite_R1 = {-3, -2, 3, -1} (t^2 row)
 f128 hermite_R0 =
 vcombine_f32(vcreate_f32(0x3F80000040000000ULL), vcreate_f32(0x3F800000C0000000ULL));
 f128 hermite_R1 =
 vcombine_f32(vcreate_f32(0xC0000000C0400000ULL), vcreate_f32(0xBF80000040400000ULL));
#else
 f128 hermite_R0 = F128::LoadA16(hermite_R0_);
 f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif

 // Accumulate the basis-weight vector: ttt*R0 + tt*R1 + t*{0,1,0,0} + {1,0,0,0}.
 ttt = F128::Mult(ttt, hermite_R0);
 ttt = F128::MultAdd(tt, hermite_R1, ttt);
 ttt = F128::MultAdd(t, F128::Set0100(), ttt);
 ttt = F128::Add(ttt, F128::Set1000());

 // vec(ttt) * mtx(p0, v0, p1, v1)
 f128 result = F128::Mult<0>(ttt, p0, each_select32);
 result = F128::MultAdd<1>(ttt, v0, result, each_select32);
 result = F128::MultAdd<2>(ttt, p1, result, each_select32);
 result = F128::MultAdd<3>(ttt, v1, result, each_select32);
 return result;
}
1717 
1718 // (-1 3 -3 1) (p0)
1719 // 0.5 * (t^3 t^2 t 1) ( 2 -5 4 -1) (p1)
1720 // (-1 0 1 0) (p2)
1721 // ( 0 2 0 0) (p3)
1722 NLIB_M2(f128)
1723 F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT {
1724  f128 tt = F128::Mult(t, t);
1725  f128 ttt = F128::Mult(tt, t);
1726 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1727  f128 catmull_R0 =
1728  vcombine_f32(vcreate_f32(0x40400000BF800000ULL), vcreate_f32(0x3F800000C0400000ULL));
1729  f128 catmull_R1 =
1730  vcombine_f32(vcreate_f32(0xC0A0000040000000ULL), vcreate_f32(0xBF80000040800000ULL));
1731  f128 catmull_R2 =
1732  vcombine_f32(vcreate_f32(0x00000000BF800000ULL), vcreate_f32(0x000000003F800000ULL));
1733 #else
1734  f128 catmull_R0 = F128::LoadA16(catmull_R0_);
1735  f128 catmull_R1 = F128::LoadA16(catmull_R1_);
1736  f128 catmull_R2 = F128::LoadA16(catmull_R2_);
1737 #endif
1738  ttt = F128::Mult(ttt, catmull_R0);
1739  ttt = F128::MultAdd(tt, catmull_R1, ttt);
1740  ttt = F128::MultAdd(t, catmull_R2, ttt);
1741  ttt = F128::Add(ttt, F128::Set0100());
1742 
1743  // vec(ttt) * mtx(p0, p1, p2, p3)
1744  f128 result = F128::Mult<0>(ttt, p0, each_select32);
1745  result = F128::MultAdd<1>(ttt, p1, result, each_select32);
1746  result = F128::MultAdd<2>(ttt, p2, result, each_select32);
1747  result = F128::MultAdd<3>(ttt, p3, result, each_select32);
1748 
1749  return result;
1750 }
1751 
1752 // p0 + f * (p1 - p0) + g * (p2 - p0)
1753 NLIB_M(f128)
1754 F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT {
1755  f128 p1p0 = F128::Sub(p1, p0);
1756  f128 p2p0 = F128::Sub(p2, p0);
1757  f128 tmp = F128::MultAdd(f, p1p0, p0);
1758  return F128::MultAdd(g, p2p0, tmp);
1759 }
1760 
// r[i] = a[i] | b[i]
// Bitwise OR of the raw lane bits (used to combine compare masks).
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
 f128 ret;
 ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
 ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
 ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
 ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
 return ret;
#elif defined(NLIB_SSE41)
 return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
 uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
 return vreinterpretq_f32_u32(tmp);
#endif
}
1777 
1778 // r[i] = a[i] ^ b[i]
1779 NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
1780 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1781  f128 ret;
1782  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
1783  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
1784  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
1785  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
1786  return ret;
1787 #elif defined(NLIB_SSE41)
1788  return _mm_xor_ps(a, b);
1789 #elif defined(NLIB_NEON)
1790  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
1791  return vreinterpretq_f32_u32(tmp);
1792 #endif
1793 }
1794 
1795 // r[i] = ~a[i]
1796 NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
1797 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1798  f128 ret;
1799  ret.vec.u[0] = ~a.vec.u[0];
1800  ret.vec.u[1] = ~a.vec.u[1];
1801  ret.vec.u[2] = ~a.vec.u[2];
1802  ret.vec.u[3] = ~a.vec.u[3];
1803  return ret;
1804 #elif defined(NLIB_SSE41)
1805  return _mm_andnot_ps(a, F128::CmpEq(a, a));
1806 #elif defined(NLIB_NEON)
1807  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
1808  return vreinterpretq_f32_u32(tmp);
1809 #endif
1810 }
1811 
1812 // r[i] = ~a[i] & b[i]
1813 NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1814 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1815  f128 ret;
1816  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
1817  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
1818  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
1819  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
1820  return ret;
1821 #elif defined(NLIB_SSE41)
1822  return _mm_andnot_ps(a, b);
1823 #elif defined(NLIB_NEON)
1824  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1825  return vreinterpretq_f32_u32(tmp);
1826 #endif
1827 }
1828 
1829 // r[i] = ~a[i] | b[i]
1830 NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
1831 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1832  f128 ret;
1833  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
1834  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
1835  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
1836  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
1837  return ret;
1838 #elif defined(NLIB_SSE41)
1839  return _mm_or_ps(F128::Not(a), b);
1840 #elif defined(NLIB_NEON)
1841  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
1842  return vreinterpretq_f32_u32(tmp);
1843 #endif
1844 }
1845 
1846 // r[i] = (a[i] == b[i]) ? 0xFFFFFFFF : 0
1847 NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
1848 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
1849  f128 ret;
1850  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
1851  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
1852  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
1853  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
1854  return ret;
1855 #elif defined(NLIB_SSE41)
1856  return _mm_cmpeq_ps(a, b);
1857 #elif defined(NLIB_NEON)
1858  uint32x4_t tmp = vceqq_f32(a, b);
1859  return vreinterpretq_f32_u32(tmp);
1860 #endif
1861 }
1862 
1863 // r[i] = (absf(a[i] - b[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1864 NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
1865  f128 tmp = F128::AbsDiff(a, b);
1866  return F128::CmpLe(tmp, eps);
1867 }
1868 
1869 // r[i] = clamp(value[i], min[i], max[i])
1870 NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
1871  return F128::Min(max, F128::Max(min, value));
1872 }
1873 
1874 // r[i] = absf(value[i]) <= bounds[i] ? 0xFFFFFFFF : 0
1875 NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
1876 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
1877  uint32x4_t tmp = vcaleq_f32(value, bounds);
1878  return vreinterpretq_f32_u32(tmp);
1879 #else
1880  return F128::CmpLe(F128::Abs(value), bounds);
1881 #endif
1882 }
1883 
1884 NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
1885 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1886  return vreinterpretq_f32_u32(vceqzq_f32(value));
1887 #else
1888  return F128::CmpEq(value, F128::SetZero());
1889 #endif
1890 }
1891 
1892 NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
1893 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1894  return vreinterpretq_f32_u32(vcltzq_f32(value));
1895 #else
1896  return F128::CmpLt(value, F128::SetZero());
1897 #endif
1898 }
1899 
1900 NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
1901 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1902  return vreinterpretq_f32_u32(vclezq_f32(value));
1903 #else
1904  return F128::CmpLe(value, F128::SetZero());
1905 #endif
1906 }
1907 
1908 NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
1909 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1910  return vreinterpretq_f32_u32(vcgtzq_f32(value));
1911 #else
1912  return F128::CmpGt(value, F128::SetZero());
1913 #endif
1914 }
1915 
1916 NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
1917 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1918  return vreinterpretq_f32_u32(vcgezq_f32(value));
1919 #else
1920  return F128::CmpGe(value, F128::SetZero());
1921 #endif
1922 }
1923 
1924 NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
1925 #if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
1926  return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
1927 #else
1928  return F128::CmpNe(value, F128::SetZero());
1929 #endif
1930 }
1931 
1932 // r[i] = (absf(value[i]) <= eps[i]) ? 0xFFFFFFFF : 0
1933 NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
1934  f128 tmp = F128::Abs(value);
1935  return F128::CmpLe(tmp, eps);
1936 }
1937 
// r[i] = 1.f / value[i] with higher precision, set infinity if value[i] == 0
NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // True IEEE division; zero lanes produce infinity naturally.
    return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    // aarch64 has a vector divide instruction.
    return vdivq_f32(vdupq_n_f32(1.f), value);
#else
    // ARMv7: reciprocal estimate refined by two Newton-Raphson steps.
    // vrecpsq_f32(x, v) computes (2 - x * v).
    float32x4_t x;
    x = vrecpeq_f32(value);
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x1 = x0 * (2 - x0 * value)
    x = vmulq_f32(x, vrecpsq_f32(x, value)); // x2 = x1 * (2 - x1 * value)
    // Force the documented value == 0 -> infinity contract explicitly.
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#endif
#elif defined(CAFE)
    return F128::Div(F128::SetOne(), value);
#endif
}
1965 
// r[i] = 1.f / value[i] with lower precision
NLIB_M2(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback is exact even though the SIMD paths are estimates.
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // Hardware reciprocal estimate (~12 bits of precision).
    return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
    return vrecpeq_f32(value);
#elif defined(CAFE)
    // Paired-single reciprocal estimate, two lanes at a time.
    f128 ret;
    ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
    return ret;
#endif
}
1986 
// r[i] = sqrtf(value[i]) with higher precision
NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) = v * (1/sqrt(v)). RecpSqrt returns infinity for zero lanes
    // (v * inf would be NaN), so clear those lanes back to 0 with AndNot.
    f128 iszero = F128::CmpEqZero(value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::AndNot(iszero, result);
#elif defined(CAFE)
    // Same v * rsqrt(v) identity, with an explicit select for zero lanes.
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#endif
}
2009 
// r[i] = sqrtf(value[i]) with lower precision
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback is exact even though the SIMD paths are estimates.
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // sqrt(v) estimated as 1 / (1/sqrt(v)), both via estimate instructions.
    return vrecpeq_f32(vrsqrteq_f32(value));
#elif defined(CAFE)
    // Same recp(rsqrt(v)) identity using paired-single estimates.
    f128 ret;
    ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
    return ret;
#endif
}
2030 
// r[i] = sqrtf(1.f / value[i]) with higher precision
NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // Full-precision divide and sqrt; zero lanes yield infinity naturally.
    return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
    // Estimate refined by two Newton-Raphson steps.
    // vrsqrtsq_f32(v, x*x) computes (3 - v * x * x) / 2.
    float32x4_t x;
    x = vrsqrteq_f32(value);
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    // Force the documented value == 0 -> infinity contract explicitly.
    f128 zeromask = F128::CmpEqZero(value);
    return F128::Select(zeromask, F128::SetInfinity(), x);
#elif defined(CAFE)
    // Paired-single estimate plus two explicit Newton-Raphson refinements:
    // x' = x * (3 - v * x^2) / 2, done separately for each f32x2 half.
    f32x2 three = __PS_FDUP(3.f);
    f32x2 half = __PS_FDUP(0.5f);
    f32x2 x;
    f32x2 xx;
    f32x2 v;
    f128 ret;

    v = value.vec.ps[0];
    x = __PS_RSQRTE(v);

    // First refinement for lanes 0/1.
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    // Second refinement for lanes 0/1.
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[0] = __PS_MUL(half, xx);

    v = value.vec.ps[1];
    x = __PS_RSQRTE(v);

    // First refinement for lanes 2/3.
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);

    // Second refinement for lanes 2/3.
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[1] = __PS_MUL(half, xx);

    // Zero lanes map to infinity, matching the other platforms.
    f128 iszero = F128::CmpEq(F128::SetZero(), value);
    f128 inf = F128::SetInfinity();
    return F128::Select(iszero, inf, ret);
#endif
}
2088 
// r[i] = sqrtf(1.f / value[i]) with lower precision
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // Scalar fallback is exact even though the SIMD paths are estimates.
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    // Hardware reciprocal-sqrt estimate (~12 bits of precision).
    return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
    return vrsqrteq_f32(value);
#elif defined(CAFE)
    // Paired-single reciprocal-sqrt estimate, two lanes at a time.
    f128 ret;
    ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
    return ret;
#endif
}
2109 
2110 template<bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
2111 NLIB_M(f128)
2112 F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
2113  const size_t lane0 = NegateLane0 ? 4 : 0;
2114  const size_t lane1 = NegateLane1 ? 5 : 1;
2115  const size_t lane2 = NegateLane2 ? 6 : 2;
2116  const size_t lane3 = NegateLane3 ? 7 : 3;
2117  return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
2118 }
2119 
// Specialization: no lane negated -- the input passes through untouched.
template<>
NLIB_M(f128)
F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
    return value;
}
2125 
// Specialization: every lane negated -- plain Negate, no permute needed.
template<>
NLIB_M(f128)
F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
    return F128::Negate(value);
}
2131 
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
// Scalar IEEE-754 bit tests on one 32-bit lane of the vector union.
// NLIB_ISNAN: exponent all ones and non-zero mantissa;
// NLIB_ISINF: exponent all ones and zero mantissa (sign ignored).
// Both are undefined again further down in this file.
#define NLIB_ISNAN(vec, idx) \
    ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif
2137 
2138 // r[i] = isnan(value[i]) ? 0xFFFFFFFF : 0
2139 NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
2140 #if defined(NLIB_F128_SIMD_NOUSE)
2141  f128 ret;
2142  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
2143  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
2144  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
2145  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
2146  return ret;
2147 #elif defined(CAFE)
2148  // on CAFE, value is NaN if value < 0 && -value < 0
2149  f32x2 one = __PS_FDUP(1.f);
2150  f32x2 minus_one = __PS_NEG(one);
2151  f32x2 v0 = value.vec.ps[0];
2152  f32x2 v1 = value.vec.ps[1];
2153  f32x2 t0 = __PS_SEL(v0, one, minus_one);
2154  f32x2 t1 = __PS_SEL(v1, one, minus_one);
2155  f128 ret;
2156  f32x2 v0neg = __PS_NEG(v0);
2157  f32x2 v1neg = __PS_NEG(v1);
2158  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
2159  ret.vec.ps[1] = __PS_SEL(v1neg, one, t0);
2160  return ret;
2161 #else
2162  return F128::CmpNe(value, value);
2163 #endif
2164 }
2165 
// r[i] = isinf(value[i]) ? 0xFFFFFFFF : 0
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(CAFE)
    // FLT_MAX - |v| is negative only when |v| exceeds every finite float,
    // i.e. +/-infinity, so the sign bit of the result carries the answer --
    // the only bit Select/MoveMask examine on CAFE. NOTE(review): NaN input
    // propagates NaN through the subtraction here; confirm callers do not
    // rely on IsInfinite(NaN) being false on this platform.
    f128 ret;
    f32x2 big_value = __PS_FDUP(FLT_MAX);
    ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
    return ret;
#else
    // |v| == +inf catches both signs with a single compare.
    f128 inf_value = F128::SetInfinity();
    f128 abs_value = F128::Abs(value);
    return F128::CmpEq(inf_value, abs_value);
#endif
}
2187 
// Round to nearest integer, ties away from zero.
// for example, 6.54321 -> 7.0, -6.54321 -> -7.0
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
    return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
    return vrndaq_f32(value);
#else
    // Add and Sub a big value to round the number after the decimal point:
    // 0x4B000000 is 2^23, the smallest float magnitude with no fraction
    // bits, so (value + 2^23) - 2^23 drops the fraction via float rounding.
    // The sign of value is copied onto the constant so negative inputs are
    // handled symmetrically.
    f128 sgn = F128::And(value, F128::SetSignMask());
    f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
    f128 result = F128::Sub(F128::Add(value, sm), sm);
    return result;
#endif
}
2202 
// Round toward zero (drop the fraction).
// for example, 6.54321 -> 6.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
// Note that there is no fraction if |value| > 2^23
// 2^23 = 8388608
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    for (size_t i = 0; i < 4; ++i) {
        if (NLIB_ISNAN(value.vec, i)) {
            // Canonical quiet NaN for NaN input lanes.
            ret.vec.u[i] = 0x7FC00000U;
        } else {
            // int cast truncates toward zero; magnitudes >= 2^23 have no
            // fractional part and pass through unchanged (and would
            // overflow the int cast anyway).
            ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                               ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                               : value.vec.v[i];
        }
    }
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    // ARMv7 has no vector truncate: convert to int and back where the value
    // is representable, otherwise keep the input (already fraction-free).
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    return F128::Select(cond, casted, value);
#else
    return vrndq_f32(value);
#endif
#endif
}
2233 
// Round toward negative infinity.
// for example, 6.54321 -> 6.0, -6.54321 -> -7.0
NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = floorf(value.vec.v[0]);
    ret.vec.v[1] = floorf(value.vec.v[1]);
    ret.vec.v[2] = floorf(value.vec.v[2]);
    ret.vec.v[3] = floorf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, which overshoots floor for negative
    // non-integers; fix up by adding -1 to those lanes.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // -1 if result is larger
    f128 large_mask = F128::CmpGt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(large_mask)));
    return F128::Select(cond, casted, value);
#else
    return vrndmq_f32(value);
#endif
#endif
}
2264 
// Round toward positive infinity.
// for example, 6.54321 -> 7.0, -6.54321 -> -6.0
NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = ceilf(value.vec.v[0]);
    ret.vec.v[1] = ceilf(value.vec.v[1]);
    ret.vec.v[2] = ceilf(value.vec.v[2]);
    ret.vec.v[3] = ceilf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    // Note that there is no fraction if |value| > 2^23
    // 2^23 = 8388608
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    // Truncation rounds toward zero, which undershoots ceil for positive
    // non-integers; fix up by subtracting -1 (i.e. adding 1) on those lanes.
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));

    // +1 if result is smaller
    f128 small_mask = F128::CmpLt(casted, value);
    // 0xFFFFFFFF -> -1 -> -1.f, 0 -> 0 -> 0.f
    casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(small_mask)));
    return F128::Select(cond, casted, value);
#else
    return vrndpq_f32(value);
#endif
#endif
}
2295 
// NLIB_ISNAN/NLIB_ISINF are defined above for both NLIB_F128_SIMD_NOUSE and
// CAFE, so undefine them under that same condition: the previous #ifdef
// (NLIB_F128_SIMD_NOUSE only) leaked both macros past this point in
// CAFE-only builds.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#undef NLIB_ISNAN
#undef NLIB_ISINF
#endif
2300 
2301 // r[i] = clamp(value[i], { 0, 0, 0, 0 }, { 1, 1, 1, 1 })
2302 NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
2303  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
2304 }
2305 
2306 NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
2307  static const float v_1_2pi = 0.15915494309189535f;
2308  static const float v_2pi = 6.283185307179586f;
2309  // value - 2pi * round(value * (1/2pi)) to be [-pi, pi)
2310  const f128 recp_two_pi = F128::SetValue(v_1_2pi, each_float);
2311  f128 round = F128::Round(F128::Mult(value, recp_two_pi));
2312  const f128 two_pi = F128::SetValue(v_2pi, each_float);
2313  return F128::MultSub(two_pi, round, value);
2314 }
2315 
2316 NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
2317  // within [-pi, pi)
2318  f128 x = F128::ModAngle(value);
2319 
2320  // within [-pi/2, pi/2]
2321  // use sin(x) == sin(pi - x), sin(x) == sin(-pi - x)
2322  // |x| <= pi/2 -> x = x
2323  // x > pi/2 -> x = pi - x
2324  // x < -pi/2 -> x = -pi - x
2325  f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
2326  f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
2327  f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);
2328 
2329  f128 xabs = F128::Abs(value);
2330  f128 xsign = F128::And(F128::SetSignMask(), x);
2331  f128 mypi = F128::Or(xsign, pi);
2332  f128 pi_x = F128::Sub(mypi, x);
2333  f128 cond = F128::CmpLe(xabs, pidiv2);
2334  x = F128::Select(cond, x, pi_x);
2335 
2336  f128 xx = F128::Mult(x, x);
2337  f128 coeff = F128::LoadA16(sin_coeff_);
2338  f128 result;
2339  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2340 
2341  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2342  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2343  result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
2344  result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
2345  result = F128::Mult(xx, result);
2346  result = F128::MultSub(result, x, x);
2347  return result;
2348 }
2349 
2350 NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
2351  // within [-pi, pi)
2352  f128 x = F128::ModAngle(value);
2353 
2354  // within [-pi/2, pi/2]
2355  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2356  // |x| <= pi/2 -> x = x
2357  // x > pi/2 -> x = pi - x
2358  // x < -pi/2 -> x = -pi - x
2359  f128 cvalue = F128::LoadA16(cos_cvalue_);
2360 
2361  f128 xabs = F128::Abs(value);
2362  f128 xsign = F128::And(F128::SetSignMask(), x);
2363  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2364  f128 pi_x = F128::Sub(mypi, x);
2365  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2366  x = F128::Select(cond, x, pi_x);
2367 
2368  // +1 if [-pi/2, pi/2], -1 otherwise
2369  f128 sign = F128::AndNot(cond, F128::SetSignMask());
2370 
2371  // xx = x^2
2372  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2373  f128 xx = F128::Mult(x, x);
2374  f128 coeff = F128::LoadA16(cos_coeff_);
2375  f128 result;
2376  result = F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2377 
2378  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2379  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2380  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2381  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2382  result = F128::MultSub(xx, result, F128::SetOne());
2383  result = F128::Xor(sign, result);
2384  return result;
2385 }
2386 
2387 NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
2388  // within [-pi, pi)
2389  const f128 signmask = F128::SetSignMask();
2390  f128 x = F128::ModAngle(value);
2391 
2392  // within [-pi/2, pi/2]
2393  // use cos(x) = -cos(pi - x), cos(x) = -cos(-pi - x)
2394  // |x| <= pi/2 -> x = x
2395  // x > pi/2 -> x = pi - x
2396  // x < -pi/2 -> x = -pi - x
2397  f128 cvalue = F128::LoadA16(cos_cvalue_);
2398 
2399  f128 xabs = F128::Abs(value);
2400  f128 xsign = F128::And(signmask, x);
2401  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32)); // pi
2402  f128 pi_x = F128::Sub(mypi, x);
2403  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32)); // pi/2
2404  x = F128::Select(cond, x, pi_x);
2405 
2406  // +1 if [-pi/2, pi/2], -1 otherwise
2407  f128 sign = F128::AndNot(cond, signmask);
2408 
2409  // xx = x^2
2410  // 1 - xx * (1/2 - xx * (1/24 - xx * (1/720 - xx * (1/40320 - xx/3628800))))
2411  f128 xx = F128::Mult(x, x);
2412  f128x2 ret;
2413 
2414  // cos
2415  {
2416  f128 coeff = F128::LoadA16(cos_coeff_);
2417  f128 result;
2418  result =
2419  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2420 
2421  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2422  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2423  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
2424  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
2425  result = F128::MultSub(xx, result, F128::SetOne());
2426 
2427  ret.val[1] = F128::Xor(sign, result); // cos
2428  }
2429 
2430  // sin
2431  {
2432  f128 coeff = F128::LoadA16(sin_coeff_);
2433  f128 result;
2434  result =
2435  F128::MultSub<0>(coeff, xx, F128::SetValue<1>(coeff, each_select32), each_select32);
2436 
2437  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
2438  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
2439  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
2440  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
2441  result = F128::Mult(xx, result);
2442  ret.val[0] = F128::MultSub(result, x, x); // sin
2443  }
2444  return ret;
2445 }
2446 
NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
    // Range split so the polynomial argument stays within [-1, 1]:
    // |value| <= 1 -> atan(value)
    // value > 1 -> pi/2 - atan(1/value)
    // value < -1 -> -pi/2 - atan(1/value)
    f128 cmp, value_sign;
    {
        f128 one = F128::SetOne();

        // value_sign:
        // 1 if value > 1,
        // -1 if value < -1
        // (encoded as the sign bit only: 0 for +, signmask for -; it is
        // irrelevant for lanes where cmp selects the direct result below)
        value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
        cmp = F128::CmpLe(F128::Abs(value), one);
    }
    // Use the reciprocal as the polynomial argument outside [-1, 1].
    f128 x = F128::Select(cmp, value, F128::Recp(value));

    // atan(x) = x - 1/3 * x^3 + ... + (-1)^n/(2n+1) * x^(2n+1)
    // = x(1 - xx(1/3 - xx(1/5 - xx(1/7 - xx(1/9 - xx(1/11 - xx(1/13 - xx(1/15 - xx(1/17)...)
    // NOTE:
    // DO NOT USE TAYLOR SERIES
    // minmax approximation(the output of Remez algorithm)
    f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
    f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
    f128 xx = F128::Mult(x, x);
    f128 result;
    // Horner evaluation from the highest-order coefficient down.
    result = F128::MultSub<3>(coeff1, xx, F128::SetValue<2>(coeff1, each_select32), each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));

    // result = x - xx * x * poly(xx) = atan(x)
    result = F128::Mult(result, x);
    result = F128::MultSub(xx, result, x);

    // Combine: (+-pi/2) - atan(1/value) for the out-of-range lanes.
    f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
    f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
    result = F128::Select(cmp, result, result_another);
    return result;
}
2488 
// Quadrant-aware arctangent of y/x, with IEEE atan2f-style special cases.
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
    // y / x -> value
    // 0 / - -> pi | sign(y)
    // 0 / + -> 0 | sign(y)
    // y / Inf -> 0 | sign(y)
    // y / -Inf -> pi | sign(y)
    // y!=0 / 0 -> pi/2 | sign(y)
    // y!=0 / - -> atan(y/x) + pi | sign(y)
    // Inf / x -> pi/2 | sign(y)
    // +-Inf / Inf -> pi/4 | sign(y)
    // +-Inf / -Inf -> 3pi/4 | sign(y)
    // otherwise -> atan(y/x)

    // sx = sign(x), sy = sign(y)
    // infx = isinf(x), infy = isinf(y)
    // zerox = iszero(x), zeroy = iszero(y)
    // posx = x > 0
    const f128 signmask = F128::SetSignMask();
    // const f128 sx = F128::And(x, signmask);
    const f128 sy = F128::And(y, signmask);
    const f128 infx = F128::IsInfinite(x);
    const f128 infy = F128::IsInfinite(y);
    const f128 zerox = F128::CmpEqZero(x);
    const f128 zeroy = F128::CmpEqZero(y);
    const f128 posx = F128::CmpGtZero(x);

    // v resolves every special case; lanes needing the generic atan(y/x)
    // path come out as the all-ones pattern (via OrNot on a false zerox):
    // v =
    // infy ?
    // infx ?
    // posx ? (pi/4 | sy) : (3pi/4 | sy)
    // : (pi/2 | sy)
    // : zeroy ?
    // posx ? (0 | sy) : (pi | sy)
    // : zerox ? (pi/2 | sy) : TrueMask;
    const f128 cval = F128::LoadA16(atan2_cvalue_);
    const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
    const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
    const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
    const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));

    f128 v = F128::Select(infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
                          F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));

// mask = EqInt(v, full);
// result = Atan(y/x) + (posx ? (0 | sy) : (pi | sy))
// return mask ? result : v;
#if defined(NLIB_F128_SIMD_NOUSE)
    // Detect the all-ones sentinel lanes with exact integer compares.
    f128 mask;
    mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
    mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
    mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
    mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(CAFE)
    // select makes 0xFFFFFFFFUL -> 0xFF7FFFFFUL
    // (so the sentinel to detect after the paired-single Select is
    // 0xFF7FFFFF, and the mask is encoded in the sign of +-1.f)
    f128 mask;
    mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
    // Integer compare of v against the all-ones pattern.
    f128 mask =
        F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v), I128::SetValue(-1, each_int8)));
#endif
    // Generic path: atan(y/x), shifted by +-pi when x is negative.
    f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
    return F128::Select(mask, result, v);
}
2555 
2556 NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
2557  // asin(x) = atan2 (x, sqrt ((1.0 + x) * (1.0 - x)))
2558  f128 one = F128::SetOne();
2559  f128 tmp = F128::MultSub(value, value, one);
2560  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2561  return F128::ArcTan2(value, argx);
2562 }
2563 
2564 NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
2565  // acos(x) = atan2 (sqrt ((1.0 + x) * (1.0 - x)), x)
2566  f128 one = F128::SetOne();
2567  f128 tmp = F128::MultSub(value, value, one);
2568  f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2569  return F128::ArcTan2(argx, value);
2570 }
2571 
// Pack one bit per lane into an int (bit i set for lane i).
// see _mm_movemask_ps of SSE
NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // NOTE(review): this path counts a lane only when it is exactly
    // 0xFFFFFFFF, while the SSE/CAFE paths test just the sign bit -- the
    // platforms agree only for canonical mask inputs (0 / all-ones).
    uint8_t ret = 0;
    ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
    ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
    ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
    ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
    // powers = {1, 2, 4, 8}: AND keeps each lane's designated bit, then a
    // horizontal add collapses the four lanes into the packed mask.
    uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
    uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
    uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
    uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
#ifdef __aarch64__
    return vaddvq_u32(a);
#else
    // ARMv7 has no across-vector add: narrow, then pairwise-add twice.
    uint16x4_t tmp = vmovn_u32(a);
    tmp = vpadd_u16(tmp, tmp);
    tmp = vpadd_u16(tmp, tmp);
    return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
#elif defined(CAFE)
    // Shift each lane's sign bit into position i, like _mm_movemask_ps.
    int tmp = (value.vec.u[0] >> 31);
    tmp |= (value.vec.u[1] >> 30) & 2;
    tmp |= (value.vec.u[2] >> 29) & 4;
    tmp |= (value.vec.u[3] >> 28) & 8;
    return tmp;
#endif
}
2604 
// true if value[i] == 0 for all i
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
    // PTEST: the ZF result of _mm_testz_si128 is 1 iff (casted & casted) == 0,
    // i.e. every bit of the register is zero.
    i128 casted = F128::CastToI128(value);
    return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    // Each mask lane becomes -1 iff the input lane is zero; the horizontal
    // add equals -4 only when all four lanes were zero.
    uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
    return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
    // OR the low and high halves; the folded 64-bit value is zero iff every
    // input bit was zero.
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
#endif
#elif defined(CAFE)
    // CAFE masks carry their truth value in the sign bit only.
    uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
    return (tmp & 0x80000000U) == 0;
#endif
}
2626 
// true if value[i] == 0xFFFFFFFF for all i
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
           value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
    // PTEST carry flag: _mm_testc_si128 yields 1 iff (~casted & ones) == 0,
    // i.e. every bit of casted is set (cmpeq of a register with itself
    // produces the all-ones constant).
    i128 casted = F128::CastToI128(value);
    return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    // Invert, then compare each lane with zero; the lanes sum to -4 iff
    // every original lane was 0xFFFFFFFF.
    uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
    return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
    // AND the halves; the folded 64-bit value is -1 iff every bit was set.
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
#endif
#elif defined(CAFE)
    // CAFE masks carry their truth value in the sign bit only.
    uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
    return (tmp & 0x80000000U) != 0;
#endif
}
2649 
template<size_t N>
// r = value[N]
// Extracts lane N (a compile-time constant, 0-3) of value as a float.
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.v[N];
#elif defined(NLIB_SSE41)
    // _MM_EXTRACT_FLOAT writes lane N of value into dest.
    float dest;
    _MM_EXTRACT_FLOAT(dest, value, N);
    return dest;
#elif defined(NLIB_NEON)
    return vgetq_lane_f32(value, N);
#elif defined(CAFE)
    // CAFE stores the vector as two paired-single (f32x2) halves.
    return value.vec.ps[N / 2][N % 2];
#endif
}
2666 
2667 template<size_t N>
2668 // r = *reinterpret_cast<uint32_t*>(&value[N])
2669 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT {
2670  NLIB_STATIC_ASSERT(N < 4);
2671 #ifdef NLIB_F128_SIMD_NOUSE
2672  return value.vec.u[N];
2673 #elif defined(NLIB_SSE41)
2674  return _mm_extract_ps(value, N);
2675 #elif defined(NLIB_NEON)
2676  uint32x4_t tmp = vreinterpretq_u32_f32(value);
2677  return vgetq_lane_u32(tmp, N);
2678 #elif defined(CAFE)
2679  return value.vec.u[N];
2680 #endif
2681 }
2682 
// r = value[idx]
// Runtime-index variant of GetFloatFromLane.  The SSE/NEON extract
// intrinsics require an immediate lane number, hence the switch; idx > 3 is
// undefined behavior (NLIB_ASSUME(0)).
NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    return value.vec.v[idx];
#elif defined(NLIB_SSE41)
    float dest;
    switch (idx) {
    case 0:
        _MM_EXTRACT_FLOAT(dest, value, 0);
        break;
    case 1:
        _MM_EXTRACT_FLOAT(dest, value, 1);
        break;
    case 2:
        _MM_EXTRACT_FLOAT(dest, value, 2);
        break;
    case 3:
        _MM_EXTRACT_FLOAT(dest, value, 3);
        break;
    default:
        NLIB_ASSUME(0);
        break;
    }
    return dest;
#elif defined(NLIB_NEON)
    switch (idx) {
    case 0:
        return vgetq_lane_f32(value, 0);
    case 1:
        return vgetq_lane_f32(value, 1);
    case 2:
        return vgetq_lane_f32(value, 2);
    case 3:
        return vgetq_lane_f32(value, 3);
    default:
        NLIB_ASSUME(0);
        break;
    }
#endif
}
2723 
// r = *reinterpret_cast<uint32_t*>(&value[idx])
// Runtime-index variant of GetUint32FromLane.  The extract intrinsics need
// an immediate lane number, hence the switch; idx > 3 is undefined behavior
// (NLIB_ASSUME(0)).
NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    return value.vec.u[idx];
#elif defined(NLIB_SSE41)
    switch (idx) {
    case 0:
        return static_cast<uint32_t>(_mm_extract_ps(value, 0));
    case 1:
        return static_cast<uint32_t>(_mm_extract_ps(value, 1));
    case 2:
        return static_cast<uint32_t>(_mm_extract_ps(value, 2));
    case 3:
        return static_cast<uint32_t>(_mm_extract_ps(value, 3));
    default:
        NLIB_ASSUME(0);
        break;
    }
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vreinterpretq_u32_f32(value);
    switch (idx) {
    case 0:
        return vgetq_lane_u32(tmp, 0);
    case 1:
        return vgetq_lane_u32(tmp, 1);
    case 2:
        return vgetq_lane_u32(tmp, 2);
    case 3:
        return vgetq_lane_u32(tmp, 3);
    default:
        NLIB_ASSUME(0);
        break;
    }
#endif
}
2759 
template<size_t N>
// r = value, r[N] = v
// Returns a copy of value with lane N (compile-time constant, 0-3) replaced
// by v.
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(N < 4);
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[N] = v;
    return ret;
#elif defined(NLIB_SSE41)
    // insert_ps immediate: copy lane 0 of tmp into lane N of value.
    f128 tmp = _mm_set_ss(v);
    return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
    // When v is a compile-time constant, splatting it and selecting with a
    // Permute (indices 4-7 pick from the second operand, so only lane N is
    // taken from the splat) can fold better than vsetq_lane; otherwise
    // insert the lane directly.
    return __builtin_constant_p(v) ? F128::Permute < N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
           N == 3 ? 7 : 3 > (value, vdupq_n_f32(v)) : vsetq_lane_f32(v, value, N);
#elif defined(CAFE)
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = v;
    return ret;
#endif
}
2780 
// r = value, r[i] = v
// Runtime-index variant of SetFloatToLane.  The insert intrinsics need an
// immediate lane number, hence the switch; i > 3 is undefined behavior on
// SSE/NEON (NLIB_ASSUME(0)) and writes lane 3 on CAFE.
NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[i] = v;
    return ret;
#elif defined(NLIB_SSE41)
    f128 tmp = _mm_set_ss(v);
    switch (i) {
    case 0:
        return _mm_insert_ps(value, tmp, 0x00);
    case 1:
        return _mm_insert_ps(value, tmp, 0x10);
    case 2:
        return _mm_insert_ps(value, tmp, 0x20);
    case 3:
        return _mm_insert_ps(value, tmp, 0x30);
    default:
        NLIB_ASSUME(0);
        break;
    }
#elif defined(NLIB_NEON)
    switch (i) {
    case 0:
        return F128::SetFloatToLane<0>(value, v);
    case 1:
        return F128::SetFloatToLane<1>(value, v);
    case 2:
        return F128::SetFloatToLane<2>(value, v);
    case 3:
        return F128::SetFloatToLane<3>(value, v);
    default:
        NLIB_ASSUME(0);
        break;
    }
#elif defined(CAFE)
    f128 ret = value;
    switch (i) {
    case 0:
        ret.vec.ps[0][0] = v;
        break;
    case 1:
        ret.vec.ps[0][1] = v;
        break;
    case 2:
        ret.vec.ps[1][0] = v;
        break;
    default:
        ret.vec.ps[1][1] = v;
        break;
    }
    return ret;
#endif
}
2835 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Helpers for F128::Swizzle on NEON.
// F64Merge<IsHighA, IsHighB> builds the pair { a[IsHighA], b[IsHighB] }
// from two float32x2_t values using a single transpose (with a lane
// reversal when the requested element is in the "wrong" half).
template<bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

// r = { a[0], b[0] }
template<>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(a, b);
#else
    return vtrn_f32(a, b).val[0];
#endif
};

// r = { a[1], b[0] }
template<>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(vrev64_f32(a), b);
#else
    return vtrn_f32(vrev64_f32(a), b).val[0];
#endif
};

// r = { a[0], b[1] }
template<>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(a, vrev64_f32(b));
#else
    return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif
};

// r = { a[1], b[1] }
template<>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn2_f32(a, b);
#else
    return vtrn_f32(a, b).val[1];
#endif
};

// Returns the low (Z == 0) or high (Z == 1) 64-bit half of value.
template<int Z>
float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;

template<>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
    return vget_low_f32(value);
}

template<>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
    return vget_high_f32(value);
}

// Generic case: r = { value[X0], value[X1] } via half extraction + merge.
// The specializations below replace it with shorter instruction sequences
// for index pairs NEON can produce directly.
template<int X0, int X1>
struct F128SwizzleHelper2 {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
        float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
        return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
    }
};

// r = { value[X], value[X] }: duplicate one lane.
template<int X>
struct F128SwizzleHelper2<X, X> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t x = F128SwizzleGet64<X / 2>(value);
        return vdup_lane_f32(x, (X & 1));
    }
};

// r = { value[0], value[1] }
template<>
struct F128SwizzleHelper2<0, 1> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vget_low_f32(value);
    }
};

// r = { value[0], value[2] }
template<>
struct F128SwizzleHelper2<0, 2> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_low_f32(vuzp1q_f32(value, value));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(lo, hi).val[0];
#endif
    }
};

// r = { value[0], value[3] }
template<>
struct F128SwizzleHelper2<0, 3> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip1_f32(lo, hi);
#else
        return vzip_f32(lo, hi).val[0];
#endif
    }
};

// r = { value[1], value[0] }
template<>
struct F128SwizzleHelper2<1, 0> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vrev64_f32(vget_low_f32(value));
    }
};

// r = { value[1], value[2] }
template<>
struct F128SwizzleHelper2<1, 2> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip2_f32(lo, hi);
#else
        return vzip_f32(lo, hi).val[1];
#endif
    }
};

// r = { value[1], value[3] }
template<>
struct F128SwizzleHelper2<1, 3> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_low_f32(vuzp2q_f32(value, value));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(lo, hi).val[1];
#endif
    }
};

// r = { value[2], value[0] }
template<>
struct F128SwizzleHelper2<2, 0> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(hi, lo).val[0];
#endif
    }
};

// r = { value[2], value[1] }
template<>
struct F128SwizzleHelper2<2, 1> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
        return vzip_f32(hi, lo).val[1];
#endif
    }
};

// r = { value[2], value[3] }
template<>
struct F128SwizzleHelper2<2, 3> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vget_high_f32(value);
    }
};

// r = { value[3], value[0] }
template<>
struct F128SwizzleHelper2<3, 0> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip1_f32(hi, lo);
#else
        return vzip_f32(hi, lo).val[0];
#endif
    }
};

// r = { value[3], value[1] }
template<>
struct F128SwizzleHelper2<3, 1> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
        return vzip2_f32(hi, lo);
#else
        return vzip_f32(hi, lo).val[1];
#endif
    }
};

// r = { value[3], value[2] }
template<>
struct F128SwizzleHelper2<3, 2> {
    static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vrev64_f32(vget_high_f32(value));
    }
};

// Full 4-lane swizzle: combine the two 64-bit halves built above.
template<int V0, int V1, int V2, int V3>
struct F128SwizzleHelper {
    static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
                            detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
    }
};

// Repeated pair: compute the half once and duplicate it.
template<int Vx, int Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
    static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
        return vcombine_f32(tmp, tmp);
    }
};

// Broadcast a single lane.
template<int V>
struct F128SwizzleHelper<V, V, V, V> {
    static NLIB_ALWAYS_INLINE float32x4_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return F128::SetValue<V>(value, each_select32);
    }
};

}  // namespace detail
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
namespace detail {

// Helpers for F128::Swizzle on CAFE.  v0 holds lanes 0-1 and v1 holds lanes
// 2-3 of the source vector; each specialization produces the paired-single
// { value[X0], value[X1] } with a single __PS_MERGE (the suffix digits
// select element 0 or 1 of the first and second operand respectively).
template<int X0, int X1>
struct F128SwizzleHelper {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT;
};

template<>
struct F128SwizzleHelper<0, 0> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v1;
        return __PS_MERGE00(v0, v0);
    }
};

template<>
struct F128SwizzleHelper<0, 1> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v1;
        return v0;
    }
};

template<>
struct F128SwizzleHelper<0, 2> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE00(v0, v1);
    }
};

template<>
struct F128SwizzleHelper<0, 3> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE01(v0, v1);
    }
};

template<>
struct F128SwizzleHelper<1, 0> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v1;
        return __PS_MERGE10(v0, v0);
    }
};

template<>
struct F128SwizzleHelper<1, 1> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v1;
        return __PS_MERGE11(v0, v0);
    }
};

template<>
struct F128SwizzleHelper<1, 2> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE10(v0, v1);
    }
};

template<>
struct F128SwizzleHelper<1, 3> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE11(v0, v1);
    }
};

template<>
struct F128SwizzleHelper<2, 0> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE00(v1, v0);
    }
};

template<>
struct F128SwizzleHelper<2, 1> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE01(v1, v0);
    }
};

template<>
struct F128SwizzleHelper<2, 2> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v0;
        return __PS_MERGE00(v1, v1);
    }
};

template<>
struct F128SwizzleHelper<2, 3> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v0;
        return v1;
    }
};

template<>
struct F128SwizzleHelper<3, 0> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE10(v1, v0);
    }
};

template<>
struct F128SwizzleHelper<3, 1> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        return __PS_MERGE11(v1, v0);
    }
};

template<>
struct F128SwizzleHelper<3, 2> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v0;
        return __PS_MERGE10(v1, v1);
    }
};

template<>
struct F128SwizzleHelper<3, 3> {
    static NLIB_ALWAYS_INLINE f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT {
        (void)v0;
        return __PS_MERGE11(v1, v1);
    }
};

}  // namespace detail
#endif
3194 
template<int V0, int V1, int V2, int V3>
// r[0] = value[V0], r[1] = value[V1], r[2] = value[V2], r[3] = value[V3]
// An index of -1 acts as the identity for that lane (lane i keeps value[i]).
NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
    NLIB_STATIC_ASSERT(V0 < 4);
    NLIB_STATIC_ASSERT(V1 < 4);
    NLIB_STATIC_ASSERT(V2 < 4);
    NLIB_STATIC_ASSERT(V3 < 4);
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
    ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
    ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
    ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
    return ret;
#elif __has_builtin(__builtin_shufflevector)
    // clang: let the compiler choose the optimal shuffle sequence.
    return __builtin_shufflevector(value, value, V0, V1, V2, V3);
#elif defined(NLIB_SSE41)
    return _mm_shuffle_ps(
        value, value,
        _MM_SHUFFLE(V3 != -1 ? V3 : 3, V2 != -1 ? V2 : 2, V1 != -1 ? V1 : 1, V0 != -1 ? V0 : 0));
#elif defined(NLIB_NEON)
    // Dispatch to the per-index-pattern helpers in detail::.
    return detail::F128SwizzleHelper < V0 != -1 ? V0 : 0, V1 != -1 ? V1 : 1, V2 != -1 ? V2 : 2,
           V3 != -1 ? V3 : 3 > ::Swizzle(value);
#elif defined(CAFE)
    // Build each paired-single half from the two source halves.
    f128 ret;
    ret.vec.ps[0] = detail::F128SwizzleHelper<(V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(
        value.vec.ps[0], value.vec.ps[1]);
    ret.vec.ps[1] = detail::F128SwizzleHelper<(V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(
        value.vec.ps[0], value.vec.ps[1]);
    return ret;
#endif
}
3227 
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Swizzle specialization for NEON
// Each pattern below maps to a single NEON permute instruction.

// {0,0,1,1}: interleave the low half with itself.
template<>
NLIB_M(f128)
F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vzip1q_f32(value, value);
#else
    return vzipq_f32(value, value).val[0];
#endif
}
// {0,0,2,2}: even lanes duplicated (transpose-even).
template<>
NLIB_M(f128)
F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1q_f32(value, value);
#else
    return vtrnq_f32(value, value).val[0];
#endif
}
// {0,1,2,3}: identity.
template<>
NLIB_M(f128)
F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
    return value;
}
// {0,2,0,2}: even lanes (unzip-even).
template<>
NLIB_M(f128)
F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vuzp1q_f32(value, value);
#else
    return vuzpq_f32(value, value).val[0];
#endif
}
// {1,0,3,2}: swap within each 64-bit pair.
template<>
NLIB_M(f128)
F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
    return vrev64q_f32(value);
}
// {1,1,3,3}: odd lanes duplicated (transpose-odd).
template<>
NLIB_M(f128)
F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn2q_f32(value, value);
#else
    return vtrnq_f32(value, value).val[1];
#endif
}
// {1,2,3,0}: rotate left by one lane.
template<>
NLIB_M(f128)
F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 1);
    return vreinterpretq_f32_u32(rotated);
}
// {1,3,1,3}: odd lanes (unzip-odd).
template<>
NLIB_M(f128)
F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vuzp2q_f32(value, value);
#else
    return vuzpq_f32(value, value).val[1];
#endif
}
// {2,2,3,3}: interleave the high half with itself.
template<>
NLIB_M(f128)
F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vzip2q_f32(value, value);
#else
    return vzipq_f32(value, value).val[1];
#endif
}
// {2,3,0,1}: rotate by two lanes (swap halves).
template<>
NLIB_M(f128)
F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 2);
    return vreinterpretq_f32_u32(rotated);
}
// {3,0,1,2}: rotate right by one lane.
template<>
NLIB_M(f128)
F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 3);
    return vreinterpretq_f32_u32(rotated);
}
#endif
3316 
namespace detail {

#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Helpers for F128::Permute on SSE4.1.  Indices 0-3 select lanes of a,
// 4-7 select lanes of b.  UseBlend is true when every output lane keeps its
// own position (only the source vector differs); UseShuffle is true when
// the low pair comes entirely from one vector and the high pair from the
// other (see the dispatch in F128PermuteHelper below).
template<bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2 {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        // General case: swizzle both sources into the target lane order,
        // then blend per lane on bit 2 of each index.
        f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
        f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
        return _mm_blend_ps(
            as, bs,
            (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) | ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
    }
};

// Lanes keep their positions: a single blend suffices.
template<bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_blend_ps(
            a, b,
            (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) | ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
    }
};

// Low pair from one source, high pair from the other: a single shuffle.
template<int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
                              _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
    }
};

// r = { a[1], a[2], a[3], b[0] }: byte-align of the b:a pair.
template<>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
        return _mm_castsi128_ps(tmp);
    }
};

// r = { a[3], b[0], b[1], b[2] }
template<>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
        return _mm_castsi128_ps(tmp);
    }
};

3364 template<>
3365 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3366  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3367  __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
3368  return _mm_castsi128_ps(tmp);
3369  }
3370 };
3371 
// One lane replaced: r = a except r[0] = b[V - 4] (insertps).
template<int V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V > 3);
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
    }
};

// r = a except r[1] = b[V - 4].
template<int V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V > 3);
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
    }
};

// r = a except r[2] = b[V - 4].
template<int V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V > 3);
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
    }
};

// r = a except r[3] = b[V - 4].
template<int V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V > 3);
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
    }
};

// r = b except r[0] = a[V].
template<int V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 4);
        return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
    }
};

// r = b except r[1] = a[V].
template<int V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 4);
        return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
    }
};

// r = b except r[2] = a[V].
template<int V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 4);
        return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
    }
};

// r = b except r[3] = a[V].
template<int V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 4);
        return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
    }
};

// Dispatcher: computes the UseBlend/UseShuffle flags described above and
// forwards to the matching F128PermuteHelper2 specialization.
template<bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return F128PermuteHelper2<
            ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
            ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
            V0, V1, V2, V3>::Permute(a, b);
    }
};

3445 
#elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)

// Returns one 64-bit half of the (a, b) pair:
// 0 = low(a), 1 = high(a), 2 = low(b), 3 = high(b).
template<int Z>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template<>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return vget_low_f32(a);
}
template<>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(b);
    return vget_high_f32(a);
}
template<>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(a);
    return vget_low_f32(b);
}
template<>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    NLIB_UNUSED(a);
    return vget_high_f32(b);
}

// r = { src[X0], src[X1] } where src is the 8-lane concatenation a:b.
template<int X0, int X1>
struct F128PermuteHelper2 {
    static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
        float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
        return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
    }
};

// r = { src[X], src[X] }: duplicate one lane.
template<int X>
struct F128PermuteHelper2<X, X> {
    static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        float32x2_t x = F128PermuteGet64<X / 2>(a, b);
        return vdup_lane_f32(x, (X & 1));
    }
};

// Full 4-lane permute from two 64-bit halves.
template<bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
                            F128PermuteHelper2<V2, V3>::Permute(a, b));
    }
};

// r = { a[1], a[2], a[3], b[0] }: single vext.
template<>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
        return vreinterpretq_f32_s32(tmp);
    }
};

// r = { a[3], b[0], b[1], b[2] }
template<>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
        return vreinterpretq_f32_s32(tmp);
    }
};

// r = { b[1], b[2], b[3], a[0] }
template<>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
        return vreinterpretq_f32_s32(tmp);
    }
};
3520 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3521 template<int R0, int R1, int VAR0, int VAR1>
3522 struct F128PermuteHelper2 {
3523  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
3524 };
3525 
3526 template<int R0, int R1>
3527 struct F128PermuteHelper2<R0, R1, 0, 0> {
3528  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3529  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
3530  }
3531 };
3532 
3533 template<int R0, int R1>
3534 struct F128PermuteHelper2<R0, R1, 0, 1> {
3535  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3536  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
3537  }
3538 };
3539 
3540 template<int R0, int R1>
3541 struct F128PermuteHelper2<R0, R1, 0, 2> {
3542  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3543  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
3544  }
3545 };
3546 
3547 template<int R0, int R1>
3548 struct F128PermuteHelper2<R0, R1, 0, 3> {
3549  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3550  return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
3551  }
3552 };
3553 
// CAFE paired-single implementations of F128PermuteHelper2<R0, R1, W0, W1>.
// W0/W1 select which source half (v0..v3 are the four f32x2 halves of the
// two input vectors) feeds each output lane, and R0/R1 select the lane
// (0 or 1) inside the chosen half; "2 + R1" addresses the second operand of
// F128SwizzleHelper.
// NOTE(review): the <W0 == 0, ...> specializations are assumed to be defined
// earlier in this file, outside this view.
template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
    }
};

template<int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
    }
};
3637 
// Generic CAFE permute.  Splits both inputs into their paired-single halves
// and builds each output half via F128PermuteHelper2.  V0..V3 are source
// lane indices in [0, 7]: 0-3 address a, 4-7 address b, so (Vi / 2) picks
// one of the four halves x0..x3 and (Vi & 1) picks the lane inside it.
template<bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        f128 ret;
        f32x2 x0 = a.vec.ps[0];
        f32x2 x1 = a.vec.ps[1];
        f32x2 x2 = b.vec.ps[0];
        f32x2 x3 = b.vec.ps[1];
        ret.vec.ps[0] =
            F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>::Permute(x0, x1, x2, x3);
        ret.vec.ps[1] =
            F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>::Permute(x0, x1, x2, x3);
        return ret;
    }
};
3653 #else
// Generic permute (SSE/NEON/scalar path): gathers each destination lane
// one at a time.  Lane i receives a[Vi] when Vi < 4, otherwise b[Vi - 4].
template<bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                                  F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                                  F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                                  F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
        return ret;
    }
};
3664 #endif
3665 
// All indices address a: a two-operand permute degenerates to a swizzle of a.
template<int V0, int V1, int V2, int V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_UNUSED(b);
        return F128::Swizzle<V0, V1, V2, V3>(a);
    }
};

// All indices address b (every Vi >= 4): swizzle b with rebased indices.
template<int V0, int V1, int V2, int V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_UNUSED(a);
        return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
    }
};
3681 
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Permute specializations for SSE4.1: the four interleave patterns below map
// directly onto a single unpcklps/unpckhps instruction instead of the generic
// lane-by-lane gather.
template<>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpacklo_ps(a, b);
    }
};
template<>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpacklo_ps(b, a);
    }
};
template<>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpackhi_ps(a, b);
    }
};
template<>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpackhi_ps(b, a);
    }
};
#endif
3709 
// Entry point after "don't care" (index 8) lanes have been resolved by the
// specializations below.  Classifies the request (all from a / all from b /
// mixed) and dispatches to the matching F128PermuteHelper.
template<int V0, int V1, int V2, int V3>
struct F128PermuteDontCareHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
        static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
        return detail::F128PermuteHelper<arg1, arg2, V0, V1, V2, V3>::Permute(a, b);
    }
};
3722 
// Specializations resolving exactly one "don't care" lane (index 8).
// Each replaces the free lane with a concrete index chosen so the lane pair
// forms an even/odd couple with its neighbor — this maximizes the chance of
// hitting a cheap shuffle/unpack pattern downstream.

// Lane 0 free: pair it with lane 1's source (even partner of V1).
template<int V1, int V2, int V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lane 1 free: odd partner of V0.
template<int V0, int V2, int V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lane 2 free: even partner of V3.
template<int V0, int V1, int V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lane 3 free: odd partner of V2.
template<int V0, int V1, int V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
3766 
// Specializations resolving exactly two "don't care" lanes, using the same
// pairing strategy as the single-lane cases: free lanes are filled with
// indices that keep even/odd couples and stay on the same source register
// as their fixed neighbors.

// Lanes 0,1 free: copy the low half of whichever register V2 addresses.
template<int V2, int V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V2 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V0 = (V2 < 4) ? 0 : 4;
        return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
    }
};

// Lanes 0,3 free.
template<int V1, int V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        static const int V3 = (V2 & 1) ? V2 : V2 + 1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lanes 2,3 free: copy the high half of whichever register V1 addresses.
template<int V0, int V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V1 < 8);
        static const int V2 = (V1 < 4) ? 2 : 6;
        return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
    }
};

// Lanes 1,2 free.
template<int V0, int V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V1 = (V0 & 1) ? V0 : V0 + 1;
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lanes 1,3 free.
template<int V0, int V2>
struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V0 < 8);
        NLIB_STATIC_ASSERT(V2 < 8);
        static const int V1 = (V0 & 1) ? V0 : V0 + 1;
        static const int V3 = (V2 & 1) ? V2 : V2 + 1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};

// Lanes 0,2 free.
template<int V1, int V3>
struct F128PermuteDontCareHelper<8, V1, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V1 < 8);
        NLIB_STATIC_ASSERT(V3 < 8);
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
3830 
// Specializations with three "don't care" lanes: if the one fixed lane V sits
// in its natural position ((V & 3) matches the lane number), fill the rest
// with the identity pattern of the same register; otherwise broadcast V.

// Only lane 0 fixed.
template<int V>
struct F128PermuteDontCareHelper<V, 8, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 8);
        static const int V1 = ((V & 3) == 0) ? V + 1 : V;
        static const int V2 = ((V & 3) == 0) ? V + 2 : V;
        static const int V3 = ((V & 3) == 0) ? V + 3 : V;
        return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
    }
};

// Only lane 1 fixed.
template<int V>
struct F128PermuteDontCareHelper<8, V, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 8);
        static const int V0 = ((V & 3) == 1) ? V - 1 : V;
        static const int V2 = ((V & 3) == 1) ? V + 1 : V;
        static const int V3 = ((V & 3) == 1) ? V + 2 : V;
        return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
    }
};

// Only lane 2 fixed.
template<int V>
struct F128PermuteDontCareHelper<8, 8, V, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 8);
        static const int V0 = ((V & 3) == 2) ? V - 2 : V;
        static const int V1 = ((V & 3) == 2) ? V - 1 : V;
        static const int V3 = ((V & 3) == 2) ? V + 2 : V;
        return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
    }
};

// Only lane 3 fixed.
template<int V>
struct F128PermuteDontCareHelper<8, 8, 8, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_STATIC_ASSERT(V < 8);
        static const int V0 = ((V & 3) == 3) ? V - 3 : V;
        static const int V1 = ((V & 3) == 3) ? V - 2 : V;
        static const int V2 = ((V & 3) == 3) ? V - 1 : V;
        return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
    }
};

// Every lane is "don't care": any result is acceptable, so return a as-is.
template<>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        NLIB_UNUSED(b);
        return a;
    }
};
3882 
3883 } // namespace detail
3884 
template<int V0, int V1, int V2, int V3>
// r[i] = Vi < 4 ? a[Vi] : b[Vi - 4].  An index of -1 means "don't care"
// (any value); internally it is normalized to 8, which the DontCare helpers
// resolve to whatever lane yields the cheapest shuffle.
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
    // Clang: let the compiler pick the optimal shuffle; -1 is its own
    // "don't care" encoding.
    return __builtin_shufflevector(a, b, (V0 != 8 ? V0 : -1), (V1 != 8 ? V1 : -1),
                                   (V2 != 8 ? V2 : -1), (V3 != 8 ? V3 : -1));
#else
    return detail::F128PermuteDontCareHelper < V0 != -1 ? V0 : 8, V1 != -1 ? V1 : 8,
           V2 != -1 ? V2 : 8, V3 != -1 ? V3 : 8 > ::Permute(a, b);
#endif
}
3896 
template<bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
// r[i] = SplatLane_i ? splat[i] : value[i]
// note that splat[0] == splat[1] == splat[2] == splat[3] is assumed
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
    // Since all lanes of splat are equal, a replaced lane may take any index
    // from splat; pairing adjacent indices (4/5, 6/7) keeps even/odd couples
    // so Permute can lower to cheaper NEON shuffles.
    const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
    const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
    const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
    const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
    // SSE4.1 has _mm_blend_ps()
    const int v0 = SplatLane0 ? 4 : 0;
    const int v1 = SplatLane1 ? 5 : 1;
    const int v2 = SplatLane2 ? 6 : 2;
    const int v3 = SplatLane3 ? 7 : 3;
#endif
    return F128::Permute<v0, v1, v2, v3>(value, splat);
}
3915 
// Computes 2^value per lane.  Non-scalar path: splits value into an integer
// part and a fractional remainder x in [-0.5, 0.5], evaluates a rational
// approximation of 2^x, then scales by 2^iround via exponent-bit arithmetic.
NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = powf(2.f, value.vec.v[0]);
    ret.vec.v[1] = powf(2.f, value.vec.v[1]);
    ret.vec.v[2] = powf(2.f, value.vec.v[2]);
    ret.vec.v[3] = powf(2.f, value.vec.v[3]);
    return ret;
#else
    // Round-to-nearest integer part and fractional remainder x.
    i128 iround = F128::ConvertToI128Round(value);
    f128 fround = F128::ConvertFromI128(iround);
    f128 x = F128::Sub(value, fround);
    f128 xx = F128::Mult(x, x);

    // Rational approximation coefficients (declared elsewhere in this file).
    f128 P = F128::LoadA16(F128::exp2_P_);
    f128 Q = F128::LoadA16(F128::exp2_Q_);

    f128 px;
    // Numerator, evaluated by Horner's method in xx:
    // px = P[0]
    // px = px * xx + P[1]
    // px = px * xx + P[2]
    // px = x * px;
    px = F128::MultAdd<0>(P, xx, F128::SetValue<1>(P, each_select32), each_select32);
    px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
    px = F128::Mult(x, px);

    f128 qx;
    // Denominator:
    // qx = xx + Q[0]
    // qx = qx * xx + Q[1]
    qx = F128::Add(xx, F128::SetValue<0>(Q, each_select32));
    qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));

    // 2^x ~= Q[2] + Q[3] * px / (qx - px)
    x = F128::Div(px, F128::Sub(qx, px));
    x = F128::MultAdd<3>(Q, x, F128::SetValue<2>(Q, each_select32), each_select32);

    // x = x * 2^iround: build the scale factor directly in the float
    // exponent field (127 = IEEE-754 single-precision exponent bias,
    // 23 = mantissa width).
    iround = I128::Add32(iround, I128::SetValue(127, each_int32));
    iround = I128::ShiftLeftLogical32(iround, 23);
    x = F128::Mult(x, F128::CastFromI128(iround));

    // NOTE:
    // overflow not checked
    return x;
#endif
}
3961 
3962 NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
3963  static const float log2e = 1.44269504088896340736f;
3964  return Exp2(F128::Mult(log2e, value));
3965 }
3966 
3967 NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
3968  static const float log2e = 1.44269504088896340736f;
3969  f128 neg_one = F128::SetValue(-1.f, each_float);
3970  f128 v0 = F128::MultAdd(log2e, value, neg_one);
3971  f128 v1 = F128::MultSub(log2e, value, neg_one);
3972  f128 e0 = Exp2(v0);
3973  f128 e1 = Exp2(v1);
3974  return F128::Sub(e0, e1);
3975 }
3976 
3977 NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
3978  static const float log2e = 1.44269504088896340736f;
3979  f128 neg_one = F128::SetValue(-1.f, each_float);
3980  f128 v0 = F128::MultAdd(log2e, value, neg_one);
3981  f128 v1 = F128::MultSub(log2e, value, neg_one);
3982  f128 e0 = Exp2(v0);
3983  f128 e1 = Exp2(v1);
3984  return F128::Add(e0, e1);
3985 }
3986 
// Computes tanh(x) per lane as 1 - 2 / (1 + expE(2x)).
// (The original comment read "1 - 2 * (1 + expE(2x))", which is not the
// tanh identity; the code below implements the division form.)
NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
    // tanh_cvalue_ lanes used: [0] = 2*log2(e), [1] = 1, [2] = 0.5
    // (presumed from usage below — declared elsewhere in this file).
    f128 cvalue = F128::LoadA16(tanh_cvalue_);
    f128 e = F128::Mult<0>(cvalue, value, each_select32);  // 2x * log2(e)
    e = F128::Exp2(e);                                     // e^(2x)
    f128 half = F128::SetValue<2>(cvalue, each_select32);
    e = F128::MultAdd(half, e, half);                      // (e^(2x) + 1) / 2
    e = F128::Recp(e);                                     // 2 / (e^(2x) + 1)
    return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
}
3997 
// Computes tan(value) per lane using the Cody-Waite argument reduction:
// reduce to f = value - g*(pi/2) with g an integer, evaluate a rational
// approximation p/q of tan on the reduced range, and return -1/(p/q) for
// odd g (tan shifted by pi/2 is -cot).
NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = tanf(value.vec.v[0]);
    ret.vec.v[1] = tanf(value.vec.v[1]);
    ret.vec.v[2] = tanf(value.vec.v[2]);
    ret.vec.v[3] = tanf(value.vec.v[3]);
    return ret;
#else
    // Cody and Waite algorithm
    // tan_c_ lanes: [0] = 2/pi, [1]+[2] = pi/2 split in two parts for an
    // exact two-step reduction, [3] = near-zero threshold (presumed from
    // usage below — coefficients declared elsewhere in this file).
    f128 C = F128::LoadA16(&F128::tan_c_[0]);

    // g = round(value / (pi/2))
    f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
    // nearx_axis: mask of lanes where g is even (result is p/q itself).
    f128 nearx_axis;
    {
        i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
        i128 cmp = I128::CmpEq32(t0, I128::SetZero());
        nearx_axis = F128::CastFromI128(cmp);
    }

    // f = value - (pi/2) * g, subtracted in two parts to retain precision.
    f128 f = F128::MultSub<1>(C, g, value, each_select32);
    f = F128::MultSub<2>(C, g, f, each_select32);

    // Mask of lanes where f is so close to zero that the rational
    // approximation is bypassed (tan(f) ~= f there).
    f128 near_axis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));

    f128 P = F128::LoadA16(&F128::tan_p_[0]);
    f128 Q = F128::LoadA16(&F128::tan_q_[0]);

    f128 ff = F128::Mult(f, f);
    f128 one = F128::SetValue<3>(P, each_select32);

    // Numerator p = f * (1 + ff*(P0 + ff*(P1 + ff*P2))), Horner in ff.
    f128 p = F128::MultAdd<2>(P, ff, F128::SetValue<1>(P, each_select32), each_select32);
    p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
    p = F128::MultAdd(p, ff, one);
    p = F128::Mult(f, p);

    // Denominator q = 1 + ff*(Q0 + ff*(Q1 + ff*(Q2 + ff*Q3))).
    f128 q = F128::MultAdd<3>(Q, ff, F128::SetValue<2>(Q, each_select32), each_select32);
    q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
    q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
    q = F128::MultAdd(q, ff, one);

    // Near zero, fall back to tan(f) ~= f / 1.
    p = F128::Select(near_axis, f, p);
    q = F128::Select(near_axis, one, q);

    f128 r0 = F128::Div(p, q);           // tan(f)
    f128 r1 = F128::Negate(F128::Recp(r0));  // -cot(f), for odd g

    return F128::Select(nearx_axis, r0, r1);
#endif
}
4050 
// Computes log2(value) per lane.  Non-scalar path: decomposes value into
// mantissa m in [1, 2) and exponent e by bit manipulation (a vector frexp),
// evaluates a rational approximation of ln(1 + x) on x = m - 1, converts to
// base 2, then patches the IEEE special cases (NaN, +/-inf, 0, negatives).
NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    static const float scale = 1.4426950408889634f; // 1 / LogE(2.0)
    f128 ret;
    ret.vec.v[0] = logf(value.vec.v[0]);
    ret.vec.v[1] = logf(value.vec.v[1]);
    ret.vec.v[2] = logf(value.vec.v[2]);
    ret.vec.v[3] = logf(value.vec.v[3]);
    return F128::Mult(scale, ret);
#else
    // x = frexp(value, &e)
    // Keep sign+mantissa bits, force the exponent field to 0 (bias 127),
    // yielding the mantissa as a float in [1, 2).
    f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
    x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
    // Extract the unbiased exponent e.
    i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
    e = I128::ShiftRightLogical32(e, 23);
    e = I128::Sub32(e, I128::SetValue(127U, each_uint32));

    x = F128::Sub(x, F128::SetOne());  // x = mantissa - 1, in [0, 1)
    f128 z = F128::Mult(x, x);
    f128 y;

    // Rational approximation coefficients, packed four per vector
    // (declared elsewhere in this file).
    f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
    f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
    f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);

    // Numerator polynomial, Horner in x.
    f128 p = F128::SetValue<0>(pq0, each_select32);
    p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));

    // Denominator polynomial (leading coefficient 1), Horner in x.
    f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));

    // y ~= ln(1+x) - x + x^2/2, the correction term of the approximation.
    y = F128::Mult(z, p);
    y = F128::Div(y, q);
    y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));

    f128 result;
    {
        // do not optimize
        // log2(v) = (x + y) * log2(e) rewritten as a sum of small terms;
        // the exact association below preserves precision.
        f128 log2ea = F128::SetValue<3>(pq2, each_select32);
        result = F128::Mult(y, log2ea);
        result = F128::MultAdd(log2ea, x, result);
        result = F128::Add(result, y);
        result = F128::Add(result, x);
        result = F128::Add(result, F128::ConvertFromI128(e));
    }

    {
        // Special-case table: [0] = NaN, [1] = +inf, [2] = -NaN, [3] = -inf
        // (presumed from the selections below; declared elsewhere).
        f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));

        // value is NaN -> NaN
        f128 is_nan = F128::IsNaN(value);
        f128 nan = F128::SetValue<0>(nan_inf, each_select32);
        result = F128::Select(is_nan, nan, result);

        f128 is_inf = F128::IsInfinite(value);
        f128 is_pos = F128::CmpGtZero(value);

        // value == inf -> +inf
        f128 inf = F128::SetValue<1>(nan_inf, each_select32);
        f128 is_pos_inf = F128::And(is_inf, is_pos);
        result = F128::Select(is_pos_inf, inf, result);

        // value == 0 -> -inf
        f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
        f128 is_zero = F128::CmpEqZero(value);
        result = F128::Select(is_zero, neg_inf, result);

        // value < 0 -> -NaN
        f128 neg_nan = F128::SetValue<2>(nan_inf, each_select32);
        f128 is_neg = F128::CmpLtZero(value);
        result = F128::Select(is_neg, neg_nan, result);

        // otherwise -> Log2(value)
    }

    return result;
#endif
}
4136 
// Computes the natural logarithm per lane: ln(v) = log2(v) * ln(2).
// (CAFE is handled inside Log2, so only the scalar-fallback macro is
// checked here.)
NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = logf(value.vec.v[0]);
    ret.vec.v[1] = logf(value.vec.v[1]);
    ret.vec.v[2] = logf(value.vec.v[2]);
    ret.vec.v[3] = logf(value.vec.v[3]);
    return ret;
#else
    // ln(2); the trailing digits differ from the exact value only below
    // float precision, so the stored float is identical.
    f128 x = F128::Log2(value);
    static const float recp_log2e = 0.6931471805597018f;
    return F128::Mult(recp_log2e, x);
#endif
}
4151 
4152 #undef NLIB_M
4153 #undef NLIB_M2
4154 #endif // NLIB_DOXYGEN
4155 
4160 typedef f128 SimdPlane;
4164 
4165 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4166 struct NLIB_ALIGNAS(16) SimdMatrix {
4167 #else
4168 struct SimdMatrix {
4169 #endif
4170  public:
4172  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
4173  r[0] = r0;
4174  r[1] = r1;
4175  r[2] = r2;
4176  r[3] = r3;
4177  }
4178  SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4179  float m13, float m20, float m21, float m22, float m23, float m30, float m31,
4180  float m32, float m33) NLIB_NOEXCEPT;
4181  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
4182 
4183  public:
4184  f128 r[4];
4185 }; // namespace simd
4186 
// Builds the matrix from 16 scalars; mRC goes to row R, column C.
inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
                              float m12, float m13, float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
    r[0] = F128::SetValue(m00, m01, m02, m03);
    r[1] = F128::SetValue(m10, m11, m12, m13);
    r[2] = F128::SetValue(m20, m21, m22, m23);
    r[3] = F128::SetValue(m30, m31, m32, m33);
}
4195 
4196 inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
4197  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4198  NLIB_ASSERT((algn & 3) == 0);
4199  switch (algn >> 2) {
4200  case 0:
4201  r[0] = F128::LoadA16(p);
4202  r[1] = F128::LoadA16(p + 4);
4203  r[2] = F128::LoadA16(p + 8);
4204  r[3] = F128::LoadA16(p + 12);
4205  break;
4206  case 1:
4207  r[0] = F128::LoadA4(p);
4208  r[1] = F128::LoadA4(p + 4);
4209  r[2] = F128::LoadA4(p + 8);
4210  r[3] = F128::LoadA4(p + 12);
4211  break;
4212  case 2:
4213  r[0] = F128::LoadA8(p);
4214  r[1] = F128::LoadA8(p + 4);
4215  r[2] = F128::LoadA8(p + 8);
4216  r[3] = F128::LoadA8(p + 12);
4217  break;
4218  default:
4219  NLIB_ASSUME(0);
4220  break;
4221  }
4222 }
4223 
// Argument type for passing a SimdMatrix: by const reference when no SIMD
// register type is available, by value otherwise.
#if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const SimdMatrix& SimdMatrixArg;
#else
typedef const SimdMatrix SimdMatrixArg;
#endif
4229 
// NLIB_F128_TRANSPOSE(row0..row3): transposes the 4x4 matrix held in the
// four row registers, in place.  One platform-specific definition below.
#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
// Two permute passes: first interleave 32-bit lane pairs of adjacent rows,
// then gather even/odd lanes to finish the transpose.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
        f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
        f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
        f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
        (row0) = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
        (row1) = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
        (row2) = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
        (row3) = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
    }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
// AArch64: vtrnq_f32 transposes the 2x2 blocks of 32-bit lanes, then
// vtrn1q_u64/vtrn2q_u64 exchange the 64-bit halves between block pairs.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
        float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
        uint64x2_t row0_, row1_, row2_, row3_; \
        row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                           vreinterpretq_u64_f32(trn_f1_.val[0])); \
        row0 = vreinterpretq_f32_u64(row0_); \
        row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                           vreinterpretq_u64_f32(trn_f1_.val[1])); \
        row1 = vreinterpretq_f32_u64(row1_); \
        row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                           vreinterpretq_u64_f32(trn_f1_.val[0])); \
        row2 = vreinterpretq_f32_u64(row2_); \
        row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                           vreinterpretq_u64_f32(trn_f1_.val[1])); \
        row3 = vreinterpretq_f32_u64(row3_); \
    }
#else
// ARMv7 NEON: vtrnq_f32 for the 2x2 blocks, then recombine the 64-bit
// halves with vget_low/vget_high + vcombine.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
        float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
        row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
        row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
        row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
    }
#endif
#elif defined(CAFE)
// CAFE paired singles: __PS_MERGE00/__PS_MERGE11 interleave the two floats
// of an f32x2 pair; the half-swap in the middle moves the off-diagonal
// blocks before the second merge pass.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f32x2 tmp0, tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
        row0.vec.ps[0] = tmp0; \
        row1.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
        row2.vec.ps[1] = tmp0; \
        row3.vec.ps[1] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = row2.vec.ps[0]; \
        row1.vec.ps[1] = row3.vec.ps[0]; \
        row2.vec.ps[0] = tmp0; \
        row3.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = tmp0; \
        row1.vec.ps[1] = tmp1; \
    }
#endif
4297 
4299  float x;
4300  float y;
4301  float z;
4302 };
4303 
4305  float x;
4306  float y;
4307  float z;
4308  float w;
4309 };
4310 
// The type for reading and writing 2x3 matrices in memory.
struct NLIB_VIS_PUBLIC Float2x3 {
    float m[2][3];  // Row-major 2x3 matrix storage.
};
4314 
4316  float m[3][3];
4317 };
4318 
// The type for reading and writing 3x4 matrices in memory; 16-byte aligned
// except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float3x4 {
#else
struct Float3x4 {
#endif
    float m[3][4];  // Row-major 3x4 matrix storage.
};

// The type for reading and writing 4x3 matrices in memory; 16-byte aligned
// except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x3 {
#else
struct Float4x3 {
#endif
    float m[4][3];  // Row-major 4x3 matrix storage.
};

// The type for reading and writing 4x4 matrices in memory; 16-byte aligned
// except on CTR.
#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
struct NLIB_ALIGNAS(16) Float4x4 {
#else
struct Float4x4 {
#endif
    float m[4][4];  // Row-major 4x4 matrix storage.
};
4342 
4343 } // namespace simd
4344 NLIB_NAMESPACE_END
4345 
4346 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
float x
The x-coordinate of the 3D vector.
Definition: SimdFloat.h:4299
SimdMatrix()
Instantiates the object with default parameters (default constructor).
Definition: SimdFloat.h:4171
The class with the collection of functions that handle 4x4 matrices.
Definition: SimdMatrix.h:28
#define NLIB_ALWAYS_INLINE
Indicates that the compiler is forced to perform inline expansion of functions.
Definition: Platform_unix.h:95
Class representing the view frustum.
Definition: SimdGeometry.h:117
The class with the collection of functions that handle quaternions.
f128arg SimdVectorArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4157
The type for two SIMD registers for 128-bit, single-precision, floating-point numbers.
Definition: SimdFloat.h:50
float x
The x-coordinate for the 4D vector.
Definition: SimdFloat.h:4305
float y
The y-coordinate of the 4D vector.
Definition: SimdFloat.h:4306
Implements the class and functions for SIMD computations on integers.
SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) noexcept
Sets up the matrix from the parameters.
Definition: SimdFloat.h:4172
The tag for representing the selection of a lane divided into 32-bit units with an empty structure...
Definition: SimdInt.h:57
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:86
f128arg SimdSphereArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4163
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:87
Class for representing oriented bounding boxes (OBB). This class has data members to hold the center ...
Definition: SimdGeometry.h:92
static f128 ShiftRight(f128arg a, f128arg b) noexcept
Sets the elements of b in shifted order to the portion of a that becomes empty when a is shifted to t...
Definition: SimdFloat.h:350
The tag for representing a single-precision floating-point number with an empty structure.
Definition: SimdFloat.h:73
#define NLIB_ASSUME(cond)
Indicates that cond is true and provides tips for optimizing the compiler.
Definition: Platform.h:259
The class with the collection of functions that determine containment relations.
Definition: SimdGeometry.h:283
The class with the collection of static member functions that handle spheres in three-dimensional spa...
Definition: SimdGeometry.h:52
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constan...
Definition: SimdFloat.h:74
f128arg SimdQuaternionArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4159
nlib_i128_t i128
nlib_i128_t is defined using typedef.
Definition: SimdInt.h:74
The class with the collection of functions that handle planes in three-dimensional space...
Definition: SimdGeometry.h:34
f128arg SimdPlaneArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4161
The class with the collection of functions that perform calculations on three-dimensional vectors...
Definition: SimdVector3.h:26
static f128 RotateLeft(f128arg value) noexcept
Rotates four single-precision floating-point numbers to the left by the amount of N...
Definition: SimdFloat.h:337
The class with the collection of functions that perform square-of-distance calculations.
Definition: SimdGeometry.h:146
The type for reading and writing four-dimensional vectors in memory. Keeps float-type x...
Definition: SimdFloat.h:4304
const f128 f128arg
const f128 or const f128& is defined using typedef.
Definition: SimdFloat.h:84
The structure for keeping a 4x4 matrix.
Definition: SimdFloat.h:4168
float z
The z-coordinate of the 4D vector.
Definition: SimdFloat.h:4307
nlib_f128x2_t f128x2
nlib_f128x2_t is defined using typedef.
Definition: SimdFloat.h:79
f128 SimdSphere
f128 is defined using typedef. Used when handling spheres.
Definition: SimdFloat.h:4162
The class for single-precision floating point SIMD computations using128-bit registers (MM0-XMM15 for...
Definition: SimdFloat.h:97
constexpr const each_uint32_tag each_uint32
The tag for representing an unsigned 32-bit integer with an each_uint32_tag-type constant object...
Definition: SimdInt.h:53
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Config.h:109
#define NLIB_CEXPR
Defines constexpr if it is available for use. If not, holds an empty string.
Definition: Config.h:111
A file that contains the configuration information for each development environment.
The class with the collection of functions that perform calculations on four-dimensional vectors...
Definition: SimdVector4.h:24
The type for reading and writing three-dimensional vectors in memory. Keeps float-type x...
Definition: SimdFloat.h:4298
#define NLIB_ALIGNAS(x)
Defines alignas(x) or the equivalent.
Definition: Config.h:260
constexpr const each_int8_tag each_int8
The tag for representing a signed 8-bit integer with an each_int8_tag-type constant object...
Definition: SimdInt.h:47
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
Definition: SimdInt.h:63
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
Definition: SimdFloat.h:4330
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Definition: SimdFloat.h:4315
The tag for representing an unsigned 32-bit integer with an empty structure.
Definition: SimdInt.h:43
static f128 RotateRight(f128arg value) noexcept
Rotates four single-precision floating-point numbers to the right by the amount of N...
Definition: SimdFloat.h:344
float y
The y-coordinate of the 3D vector.
Definition: SimdFloat.h:4300
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Definition: SimdFloat.h:77
float z
The z-coordinate of the 3D vector.
Definition: SimdFloat.h:4301
Class for representing axis-aligned bounding boxes (AABB). The class has data members to hold the min...
Definition: SimdGeometry.h:73
constexpr const each_int32_tag each_int32
The tag for representing a signed 32-bit integer with an each_int32_tag-type constant object...
Definition: SimdInt.h:49
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Definition: Config.h:174
float w
The w-coordinate of the 4D vector.
Definition: SimdFloat.h:4308
The class with the collection of functions that determine intersections.
Definition: SimdGeometry.h:193
f128 SimdQuaternion
f128 is defined using typedef. Used when handling quaternions.
Definition: SimdFloat.h:4158
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
Definition: SimdFloat.h:4338
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
Definition: SimdFloat.h:4322
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
Definition: SimdFloat.h:4160
__m128 nlib_f128_t
The type for a SIMD register for 128-bit, single-precision, floating-point numbers.
Definition: SimdFloat.h:49
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...
Definition: SimdFloat.h:4156