#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

# ifndef __USE_C99_MATH
# define __USE_C99_MATH

#if !defined(NLIB_SIMD) && !defined(NLIB_CAFE_PPC)
#define NLIB_F128_SIMD_NOUSE

#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
#elif defined(NLIB_NEON)
#elif defined(NLIB_CAFE_PPC)

#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)

#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;

#if !defined(_MSC_VER) || _MSC_VER < 1800
static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(NLIB_CAFE_PPC)
static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT;
static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT;
static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
static f128 __vectorcall Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
static f128 __vectorcall CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
static f128 __vectorcall BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
template <size_t V0, size_t V1, size_t V2, size_t V3>
static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
template <size_t V0, size_t V1, size_t V2, size_t V3>
static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
template <size_t N>
static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
  const size_t NN = 4 - N;
  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
}
template <size_t N>
static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
}
template <size_t N>
static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
}
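// Worked examples of the index arithmetic above (Swizzle indices 0-3 address
// lanes of its operand; Permute indices 4-7 address the second operand):
//   RotateLeft<1>(v)    -> Swizzle<3, 0, 1, 2>(v)    -> {v3, v0, v1, v2}
//   RotateRight<1>(v)   -> Swizzle<1, 2, 3, 0>(v)    -> {v1, v2, v3, v0}
//   ShiftRight<1>(a, b) -> Permute<1, 2, 3, 4>(a, b) -> {a1, a2, a3, b0}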
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);
  return vdupq_lane_f32(tmp, N);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
}
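// vdupq_lane_f32 broadcasts from a 64-bit half-register, so its lane index
// must be 0 or 1. The generic version above covers lanes 0-1 via
// vget_low_f32; these specializations reach lanes 2-3 through vget_high_f32.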
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
  return vsetq_lane_f32(0.f, value, N);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[N / 2][N % 2] = 0.f;
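// The SSE4.1 path uses _mm_insert_ps purely for its zero mask: bits 0-3 of
// the immediate clear the corresponding result lanes, so (1 << N) zeroes
// lane N while leaving the other lanes of value intact.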
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
  return F128::SetValue(1.f, each_float);
}
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
  return F128::SetValue(-1.f, each_float);
}
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
  return F128::SetValue(1.0e-7f, each_float);
}
NLIB_M(f128) F128::SetInfinity() NLIB_NOEXCEPT {
  return F128::SetValue(0x7F800000U, each_uint32);
}
NLIB_M(f128) F128::SetNaN() NLIB_NOEXCEPT {
  return F128::SetValue(0x7FC00000U, each_uint32);
}
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
  return F128::SetValue(-0.f, each_float);
}
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT { StoreLoA8(p, value); }
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  vst1_f32(p, vget_high_f32(value));
#elif defined(NLIB_CAFE_PPC)
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT { StoreHiA8(p, value); }
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
  ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
  ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
  ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
#elif defined(NLIB_NEON)
  return vabsq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_andnot_ps(signmask, value);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
  result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
  result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
  result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
#elif defined(NLIB_SSE41)
  return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#elif defined(NLIB_CAFE_PPC)
  mask_.vec.u[0] &= 0xFF7FFFFFUL;
  mask_.vec.u[1] &= 0xFF7FFFFFUL;
  mask_.vec.u[2] &= 0xFF7FFFFFUL;
  mask_.vec.u[3] &= 0xFF7FFFFFUL;
  ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(NLIB_CAFE_PPC)
NLIB_M(f128) F128::ConvertFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
  return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}
NLIB_M(f128) F128::CastFromI128(i128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_f32_s8(value);
#endif
}
NLIB_M(i128) F128::ConvertToI128Round(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
  uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
  uint32x4_t sgn = vdupq_n_u32(0x80000000U);
  uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
  w = vorrq_u32(w, half);
  return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}
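// The NEON branch rounds half away from zero: it ORs the sign bit of each
// lane onto 0.5f, adds the result, and lets vcvtq_s32_f32 truncate toward
// zero. _mm_cvtps_epi32, by contrast, follows the MXCSR rounding mode
// (round-to-nearest-even by default), so halfway cases may differ.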
NLIB_M(i128) F128::ConvertToI128Truncate(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}
NLIB_M(i128) F128::CastToI128(f128 value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_f32(value);
#endif
}
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcltq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
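// Cafe's paired singles have no compare-to-bitmask operation, so comparison
// results are encoded in the sign of each element: a - b is negative exactly
// where a < b. __PS_SEL(c, x, y) yields x where c >= 0 and y where c < 0,
// which is how Select() above consumes masks of this form.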
NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcleq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  x0 = __PS_SUB(b.vec.ps[0], a.vec.ps[0]);
  x1 = __PS_SUB(b.vec.ps[1], a.vec.ps[1]);
  ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
  ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgtq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_SUB(b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(b.vec.ps[1], a.vec.ps[1]);
NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgeq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  x0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  x1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
  ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_MUL(__PS_SUB(a.vec.ps[0], b.vec.ps[0]),
                           __PS_SUB(b.vec.ps[0], a.vec.ps[0]));
  ret.vec.ps[1] = __PS_MUL(__PS_SUB(a.vec.ps[1], b.vec.ps[1]),
                           __PS_SUB(b.vec.ps[1], a.vec.ps[1]));
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
  return vaddq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
  return vsubq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = -value.vec.v[0];
  ret.vec.v[1] = -value.vec.v[1];
  ret.vec.v[2] = -value.vec.v[2];
  ret.vec.v[3] = -value.vec.v[3];
#elif defined(NLIB_NEON)
  return vnegq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_xor_ps(signmask, value);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
  return vmulq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmulq_n_f32(b, a);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
  return ret;
#else
  return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}
NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vmulq_laneq_f32(b, a, N);  // the laneq form requires AArch64
#else
  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
  return vmulq_n_f32(b, tmp);
#endif
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
  float t = a.vec.ps[N / 2][N % 2];
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
  return ret;
#endif
}
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
  float32x4_t inv0 = vrecpeq_f32(b);
  float32x4_t step0 = vrecpsq_f32(inv0, b);
  float32x4_t inv1 = vmulq_f32(step0, inv0);
  float32x4_t step1 = vrecpsq_f32(inv1, b);
  float32x4_t inv2 = vmulq_f32(step1, inv1);
  uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
  inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
  return vmulq_f32(a, inv2);
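// vrecpeq_f32 only gives a ~8-bit reciprocal estimate; each vrecpsq_f32 step
// computes (2 - b*x), so x *= (2 - b*x) is one Newton-Raphson refinement.
// Lanes where b == 0 are then patched to +infinity so the result matches a
// true IEEE division.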
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
  return vmaxq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
  return vminq_f32(a, b);
#elif defined(NLIB_CAFE_PPC)
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
#elif defined(NLIB_SSE41)
  f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpmaxq_f32(a, b);  // A64-only full-width pairwise max
#else
  float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#elif defined(NLIB_CAFE_PPC)
  f32x2 v02, v13, cmp;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
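// __PS_MERGE00/__PS_MERGE11 collect the even elements {v0, v2} and the odd
// elements {v1, v3} of each input, so one sign-based select per half
// produces all four pairwise results at once.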
NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
#elif defined(NLIB_SSE41)
  f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpminq_f32(a, b);  // A64-only full-width pairwise min
#else
  float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#elif defined(NLIB_CAFE_PPC)
  f32x2 v02, v13, cmp;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_f32(a, b);  // A64-only full-width pairwise add
#else
  float32x2_t al = vget_low_f32(a);
  float32x2_t ah = vget_high_f32(a);
  float32x2_t l = vpadd_f32(al, ah);
  float32x2_t bl = vget_low_f32(b);
  float32x2_t bh = vget_high_f32(b);
  float32x2_t h = vpadd_f32(bl, bh);
  return vcombine_f32(l, h);
#endif
#elif defined(NLIB_CAFE_PPC)
  f32x2 v02, v13;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  ret.vec.ps[0] = __PS_ADD(v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  ret.vec.ps[1] = __PS_ADD(v02, v13);
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vabdq_f32(a, b);
#else
  return F128::Abs(F128::Sub(a, b));
#endif
}
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmlaq_f32(c, a, b);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Add(c, F128::Mult(a, b));
#endif
}
NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmlaq_n_f32(c, b, a);
#else
  return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vmlaq_laneq_f32(c, b, a, N);  // the laneq form requires AArch64
#else
  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmlsq_f32(c, a, b);
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Sub(c, F128::Mult(a, b));
#endif
}
NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmlsq_n_f32(c, b, a);
#else
  return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vmlsq_laneq_f32(c, b, a, N);  // the laneq form requires AArch64
#else
  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
  return F128::MultAdd(t, F128::Sub(b, a), a);
}
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  f128 zero = F128::SetZero();
  f128 sum = F128::Add(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
  f128 ofs = F128::Select(cond, pi_dbl, zero);
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
  ofs = F128::Select(cond, pi_dbl, zero);
  return F128::Sub(result, ofs);
}
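// AddAngle/SubAngle wrap the raw sum back into [-pi, pi): assuming
// pi_values_ holds {pi, -pi, ...} and pi_dbl is 2*pi (its initialization is
// not shown here), 2*pi is added where the sum fell below -pi and subtracted
// where it reached +pi.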
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  f128 zero = F128::SetZero();
  f128 sum = F128::Sub(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
  f128 ofs = F128::Select(cond, pi_dbl, zero);
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
  ofs = F128::Select(cond, pi_dbl, zero);
  return F128::Sub(result, ofs);
}
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
  ttt = F128::Mult(ttt, F128::LoadA16(hermite_R0_));
  ttt = F128::MultAdd(tt, F128::LoadA16(hermite_R1_), ttt);
  ttt = F128::MultAdd(t, F128::LoadA16(hermite_R2_), ttt);
  ttt = F128::Add(ttt, F128::LoadA16(hermite_R3_));
NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
  ttt = F128::Mult(ttt, F128::LoadA16(catmull_R0_));
  ttt = F128::MultAdd(tt, F128::LoadA16(catmull_R1_), ttt);
  ttt = F128::MultAdd(t, F128::LoadA16(catmull_R2_), ttt);
  ttt = F128::Add(ttt, F128::LoadA16(catmull_R3_));
NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
                               f128arg_ex g) NLIB_NOEXCEPT {
  f128 p1p0 = F128::Sub(p1, p0);
  f128 p2p0 = F128::Sub(p2, p0);
  f128 tmp = F128::MultAdd(f, p1p0, p0);
  return F128::MultAdd(g, p2p0, tmp);
}
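// BaryCentric evaluates p0 + f*(p1 - p0) + g*(p2 - p0), i.e. the point with
// barycentric weights (1 - f - g, f, g) over the triangle p0, p1, p2.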
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = ~a.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_andnot_ps(a, F128::CmpEq(a, a));
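// CmpEq(a, a) materializes an all-ones mask, assuming no lane of a is NaN
// (a NaN lane compares unequal to itself and would leave those bits clear).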
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
#elif defined(NLIB_SSE41)
  return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
#elif defined(NLIB_SSE41)
  return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vceqq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#elif defined(NLIB_CAFE_PPC)
  f32x2 x0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 x1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  x0 = __PS_MUL(x0, __PS_NEG(x0));
  x1 = __PS_MUL(x1, __PS_NEG(x1));
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  ret.vec.ps[0] = __PS_SEL(x0, minus_one, one);
  ret.vec.ps[1] = __PS_SEL(x1, minus_one, one);
NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
  f128 tmp = F128::AbsDiff(a, b);
  return F128::CmpLe(tmp, eps);
}
NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
  return F128::Min(max, F128::Max(min, value));
}
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint32x4_t tmp = vcaleq_f32(value, bounds);
  return vreinterpretq_f32_u32(tmp);
#else
  return F128::CmpLe(F128::Abs(value), bounds);
#endif
}
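// vcaleq_f32 tests |value| <= |bounds|, while the fallback tests
// |value| <= bounds; the two agree under the usual assumption that bounds
// holds non-negative values.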
NLIB_M2(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = 1.f / value.vec.v[0];
  ret.vec.v[1] = 1.f / value.vec.v[1];
  ret.vec.v[2] = 1.f / value.vec.v[2];
  ret.vec.v[3] = 1.f / value.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
  x = vrecpeq_f32(value);
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
  float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
#elif defined(NLIB_CAFE_PPC)
  return F128::Div(F128::SetOne(), value);
NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = 1.f / value.vec.v[0];
  ret.vec.v[1] = 1.f / value.vec.v[1];
  ret.vec.v[2] = 1.f / value.vec.v[2];
  ret.vec.v[3] = 1.f / value.vec.v[3];
#elif defined(NLIB_SSE41)
  return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
  return vrecpeq_f32(value);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
NLIB_M2(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  f128 zero = F128::SetZero();
  f128 iszero = F128::CmpEq(zero, value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::Select(iszero, zero, result);
#elif defined(NLIB_CAFE_PPC)
  f128 zero = F128::SetZero();
  f128 iszero = F128::CmpEq(zero, value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::Select(iszero, zero, result);
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  return vrecpeq_f32(vrsqrteq_f32(value));
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
NLIB_M2(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = 1.f / sqrtf(value.vec.v[0]);
  ret.vec.v[1] = 1.f / sqrtf(value.vec.v[1]);
  ret.vec.v[2] = 1.f / sqrtf(value.vec.v[2]);
  ret.vec.v[3] = 1.f / sqrtf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
  x = vrsqrteq_f32(value);
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
  float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
#elif defined(NLIB_CAFE_PPC)
  f32x2 three = __PS_FDUP(3.f);
  f32x2 half = __PS_FDUP(0.5f);
  v = value.vec.ps[0];
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[0] = __PS_MUL(half, xx);
  v = value.vec.ps[1];
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[1] = __PS_MUL(half, xx);
  f128 iszero = F128::CmpEq(F128::SetZero(), value);
  f128 inf = F128::SetInfinity();
  return F128::Select(iszero, inf, ret);
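// Each block of four __PS_ ops above is one Newton-Raphson step for
// 1/sqrt(v): x' = x * (3 - v * x^2) / 2. Two steps per half refine the
// hardware estimate, and lanes where value == 0 are patched to +infinity
// at the end.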
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  ret.vec.v[0] = 1.f / sqrtf(value.vec.v[0]);
  ret.vec.v[1] = 1.f / sqrtf(value.vec.v[1]);
  ret.vec.v[2] = 1.f / sqrtf(value.vec.v[2]);
  ret.vec.v[3] = 1.f / sqrtf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
  return vrsqrteq_f32(value);
#elif defined(NLIB_CAFE_PPC)
  ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
  const size_t lane0 = NegateLane0 ? 4 : 0;
  const size_t lane1 = NegateLane1 ? 5 : 1;
  const size_t lane2 = NegateLane2 ? 6 : 2;
  const size_t lane3 = NegateLane3 ? 7 : 3;
  return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}
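// Permute indices 0-3 select lanes of the first operand (the original value)
// and 4-7 lanes of the second (the fully negated copy), so a lane is negated
// exactly when its flag picks the +4 index. The specializations below swap
// the generic permute for cheaper platform-specific sequences.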
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
  return value;
}
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
  return F128::Negate(value);
}
#ifndef NLIB_F128_SIMD_NOUSE
#if defined(NLIB_NEON)
NLIB_M(f128) F128::NegateEx<true, false, true, false>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  float32x4_t tmp = vnegq_f32(value);
  tmp = vcopyq_laneq_f32(tmp, 1, value, 1);
  tmp = vcopyq_laneq_f32(tmp, 3, value, 3);
  return tmp;
#else
  float32x2x2_t tmp = vtrn_f32(vget_low_f32(value), vget_high_f32(value));
  tmp.val[0] = vneg_f32(tmp.val[0]);
  tmp = vtrn_f32(tmp.val[0], tmp.val[1]);
  return vcombine_f32(tmp.val[0], tmp.val[1]);
#endif
}
NLIB_M(f128) F128::NegateEx<false, true, false, true>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  float32x4_t tmp = vnegq_f32(value);
  tmp = vcopyq_laneq_f32(tmp, 0, value, 0);
  tmp = vcopyq_laneq_f32(tmp, 2, value, 2);
  return tmp;
#else
  float32x2x2_t tmp = vtrn_f32(vget_low_f32(value), vget_high_f32(value));
  tmp.val[1] = vneg_f32(tmp.val[1]);
  tmp = vtrn_f32(tmp.val[0], tmp.val[1]);
  return vcombine_f32(tmp.val[0], tmp.val[1]);
#endif
}
NLIB_M(f128) F128::NegateEx<false, false, true, true>(f128arg value) NLIB_NOEXCEPT {
  float32x2_t lo = vget_low_f32(value);
  float32x2_t hi = vneg_f32(vget_high_f32(value));
  return vcombine_f32(lo, hi);
}
NLIB_M(f128) F128::NegateEx<true, true, false, false>(f128arg value) NLIB_NOEXCEPT {
  float32x2_t lo = vneg_f32(vget_low_f32(value));
  float32x2_t hi = vget_high_f32(value);
  return vcombine_f32(lo, hi);
}
#elif defined(NLIB_SSE41)
NLIB_M(f128) F128::NegateEx<true, false, true, false>(f128arg value) NLIB_NOEXCEPT {
  return _mm_addsub_ps(SetZero(), value);
}
NLIB_M(f128) F128::NegateEx<false, true, false, true>(f128arg value) NLIB_NOEXCEPT {
  f128 zero = SetZero();
  return _mm_sub_ps(zero, _mm_addsub_ps(zero, value));
}
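// _mm_addsub_ps(zero, v) computes {0 - v0, 0 + v1, 0 - v2, 0 + v3}, negating
// exactly the even lanes; subtracting that from zero flips the pattern onto
// the odd lanes.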
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
#define NLIB_ISNAN(vec, idx) \
  ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
NLIB_M2(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
#elif defined(NLIB_CAFE_PPC)
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  f32x2 v0 = value.vec.ps[0];
  f32x2 v1 = value.vec.ps[1];
  f32x2 t0 = __PS_SEL(v0, one, minus_one);
  f32x2 t1 = __PS_SEL(v1, one, minus_one);
  f32x2 v0neg = __PS_NEG(v0);
  f32x2 v1neg = __PS_NEG(v1);
  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
  ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);
#else
  return F128::CmpNe(value, value);
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
#elif defined(NLIB_CAFE_PPC)
  f32x2 big_value = __PS_FDUP(FLT_MAX);
  ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
#else
  f128 inf_value = F128::SetInfinity();
  f128 abs_value = F128::Abs(value);
  return F128::CmpEq(inf_value, abs_value);
#endif
}
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
  return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
  f128 sgn = F128::And(value, F128::SetSignMask());
  f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
  f128 result = F128::Sub(F128::Add(value, sm), sm);
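// Classic 2^23 trick: 0x4B000000 is 8388608.0f (2^23), given the input's
// sign. Adding then subtracting it discards all fraction bits, rounding to
// an integer in the prevailing rounding mode; it is only valid for
// |value| < 2^23, which the final (not shown) selection must account for.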
NLIB_M2(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  for (size_t i = 0; i < 4; ++i) {
    if (NLIB_ISNAN(value.vec, i)) {
      ret.vec.u[i] = 0x7FC00000U;
    } else {
      ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                         ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                         : value.vec.v[i];
    }
  }
#elif defined(NLIB_SSE41)
  return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  return F128::Select(cond, casted, value);
#endif
}
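// Floats with |x| >= 2^23 carry no fraction bits and are returned unchanged,
// which also keeps the int32 conversion (which would overflow there) out of
// the result.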
NLIB_M2(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.v[0] = floorf(value.vec.v[0]);
  ret.vec.v[1] = floorf(value.vec.v[1]);
  ret.vec.v[2] = floorf(value.vec.v[2]);
  ret.vec.v[3] = floorf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_floor_ps(value);
#else
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  f128 largeMask = F128::CmpGt(casted, value);
  casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
  return F128::Select(cond, casted, value);
#endif
}
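// Truncation rounds toward zero, so for negative non-integers casted is one
// above the floor. largeMask is all-ones where casted > value; reinterpreted
// as int32 that is -1, and converting it back to float adds -1.0f exactly
// where the correction is needed.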
NLIB_M2(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  ret.vec.v[0] = ceilf(value.vec.v[0]);
  ret.vec.v[1] = ceilf(value.vec.v[1]);
  ret.vec.v[2] = ceilf(value.vec.v[2]);
  ret.vec.v[3] = ceilf(value.vec.v[3]);
#elif defined(NLIB_SSE41)
  return _mm_ceil_ps(value);
#else
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  f128 smallMask = F128::CmpLt(casted, value);
  casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
  return F128::Select(cond, casted, value);
#endif
}
#ifdef NLIB_F128_SIMD_NOUSE

NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
}
NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
  static const float v_1_2pi = 0.15915494309189535f;
  static const float v_2pi = 6.283185307179586f;
  const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
  f128 round = F128::Round(F128::Mult(value, recpTwoPi));
  const f128 twoPi = F128::SetValue(v_2pi, each_float);
  return F128::MultSub(twoPi, round, value);
}
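// Standard range reduction: value - 2*pi * round(value / (2*pi)) maps any
// angle into [-pi, pi]. Since MultSub(a, b, c) computes c - a*b, the last
// line is exactly that expression.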
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
  f128 x = F128::ModAngle(value);
  f128 pi_ = F128::LoadA16(F128::pi_values_);
  f128 xabs = F128::Abs(x);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, pi);
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, pidiv2);
  x = F128::Select(cond, x, pi_x);
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(sin_coeff_);
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[4], each_float));
  result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[5], each_float));
  result = F128::Mult(xx, result);
  result = F128::MultSub(result, x, x);
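// After the sin(pi - x) = sin(x) reflection, x lies within [-pi/2, pi/2],
// where the odd minimax polynomial evaluated by the MultSub chain (Horner
// form; MultSub(a, b, c) = c - a*b) is accurate. The initial value of
// result comes from the first two coefficients (not shown here).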
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
  f128 x = F128::ModAngle(value);
  f128 cvalue = F128::LoadA16(cos_cvalue_);
  f128 xabs = F128::Abs(x);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  f128 sign = F128::Select(cond, F128::SetValue<2>(cvalue, each_select32),
                           F128::SetValue<3>(cvalue, each_select32));
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(cos_coeff_);
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[4], each_float));
  result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[5], each_float));
  result = F128::MultSub(xx, result, F128::SetOne());
  result = F128::Mult(sign, result);
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
  f128x2 ret;
  f128 x = F128::ModAngle(value);
  f128 cvalue = F128::LoadA16(cos_cvalue_);
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  f128 sign = F128::Select(cond, F128::SetValue<2>(cvalue, each_select32),
                           F128::SetValue<3>(cvalue, each_select32));
  f128 xx = F128::Mult(x, x);
  {
    // Cosine polynomial.
    f128 coeff = F128::LoadA16(cos_coeff_);
    f128 result = F128::SetValue<0>(coeff, each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[4], each_float));
    result = F128::MultSub(xx, result, F128::SetValue(cos_coeff_[5], each_float));
    result = F128::MultSub(xx, result, F128::SetOne());
    ret.val[1] = F128::Mult(sign, result);
  }
  {
    // Sine polynomial (shares the argument reduction above).
    f128 coeff = F128::LoadA16(sin_coeff_);
    f128 result = F128::SetValue<0>(coeff, each_select32);
    result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[4], each_float));
    result = F128::MultSub(xx, result, F128::SetValue(sin_coeff_[5], each_float));
    result = F128::Mult(xx, result);
    ret.val[0] = F128::MultSub(result, x, x);
  }
  return ret;
}
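// Usage sketch (illustrative): SinCos evaluates both polynomials from a single
// argument reduction, which is cheaper than calling Sin and Cos separately.
//   f128x2 sc = F128::SinCos(angles);  // angles: hypothetical f128 input
//   f128 s = sc.val[0];                // sine per lane
//   f128 c = sc.val[1];                // cosine per lane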
NLIB_M2(f128) F128::ArcTan(f128arg value) NLIB_NOEXCEPT {
  f128 cmp, value_sign;
  f128 one = F128::SetOne();
  f128 negative_one = F128::SetNegativeOne();
  value_sign = F128::Select(F128::CmpGt(value, one), one, negative_one);
  // |value| <= 1: evaluate directly; otherwise evaluate on 1/value.
  cmp = F128::CmpLe(F128::Abs(value), one);
  f128 x = F128::Select(cmp, value, F128::Recp(value));
  f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
  f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
  f128 xx = F128::Mult(x, x);
  f128 result = F128::SetValue<3>(coeff1, each_select32);
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff1, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
  result = F128::Mult(result, x);
  result = F128::MultSub(xx, result, x);
  // atan(v) = sign(v)*pi/2 - atan(1/v) for |v| > 1.
  f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
  f128 result_another = F128::Sub(F128::Mult(value_sign, pi_2), result);
  result = F128::Select(cmp, result, result_another);
  return result;
}
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
  const f128 signmask = F128::SetSignMask();
  const f128 sy = F128::And(y, signmask);
  const f128 infx = F128::IsInfinite(x);
  const f128 infy = F128::IsInfinite(y);
  f128 zero = F128::SetZero();
  const f128 zerox = F128::CmpEq(x, zero);
  const f128 zeroy = F128::CmpEq(y, zero);
  const f128 posx = F128::CmpGt(x, zero);
  // Propagate the sign of y onto the special-case constants.
  zero = F128::Or(zero, sy);
  const f128 cval = F128::LoadA16(atan2_cvalue_);
  const f128 pi = F128::Or(zero, F128::SetValue<0>(cval, each_select32));
  const f128 pi_34 = F128::Or(zero, F128::SetValue<1>(cval, each_select32));
  const f128 pi_2 = F128::Or(zero, F128::SetValue<2>(cval, each_select32));
  const f128 pi_4 = F128::Or(zero, F128::SetValue<3>(cval, each_select32));
#if defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
  const f128 full = F128::SetValue(0xFF7FFFFFUL, each_uint32);
#else
  const f128 full = F128::CmpEq(zero, zero);
#endif
  // Resolve infinities and zeros; 'full' marks lanes that fall through to
  // the generic atan(y/x) path.
  f128 v = F128::Select(
      infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
      F128::Select(zeroy, F128::Select(posx, zero, pi), F128::Select(zerox, pi_2, full)));
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 mask;
  mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
  mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
  mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
  mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(NLIB_CAFE_PPC)
  f128 mask;
  mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
  mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
  mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
  mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
  f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v), F128::CastToI128(full)));
#endif
  f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::Select(posx, zero, pi));
  return F128::Select(mask, result, v);
}
NLIB_M2(f128) F128::ArcSin(f128arg value) NLIB_NOEXCEPT {
  f128 one = F128::SetOne();
  f128 zero = F128::SetZero();
  f128 tmp = F128::MultSub(value, value, one);  // 1 - value^2
  f128 argx = F128::Sqrt(F128::Select(F128::CmpLt(tmp, zero), zero, tmp));
  return F128::ArcTan2(value, argx);
}
NLIB_M2(f128) F128::ArcCos(f128arg value) NLIB_NOEXCEPT {
  f128 one = F128::SetOne();
  f128 zero = F128::SetZero();
  f128 tmp = F128::MultSub(value, value, one);  // 1 - value^2
  f128 argx = F128::Sqrt(F128::Select(F128::CmpLt(tmp, zero), zero, tmp));
  return F128::ArcTan2(argx, value);
}
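// Note (illustrative): both inverse functions are built on ArcTan2, and the
// Select above clamps 1 - value^2 at zero so slightly out-of-range inputs do
// not feed a negative number into Sqrt.
//   f128 t = F128::ArcSin(F128::SetValue(0.f, 0.5f, 1.f, -1.f));
//   // -> approximately {0, pi/6, pi/2, -pi/2}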
NLIB_M2(int) F128::MoveMask(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  int ret = 0;
  ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
  ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
  ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
  ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
  // Select the per-lane bit weights (1,2,4,8) with the mask, then sum them.
  NLIB_ALIGNAS(16) const uint64_t powers_[2] = {0x0000000200000001U, 0x0000000800000004U};
  uint64x2_t powers = vld1q_u64(powers_);
  uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(value),
                             vreinterpretq_u32_u64(powers));
  uint8x16_t mask = vreinterpretq_u8_u64(vpaddlq_u32(tmp));
  uint8_t result = vgetq_lane_u8(mask, 0) + vgetq_lane_u8(mask, 8);
  return result;
#elif defined(NLIB_CAFE_PPC)
  int tmp = (value.vec.u[0] >> 31);
  tmp |= (value.vec.u[1] >> 30) & 2;
  tmp |= (value.vec.u[2] >> 29) & 4;
  tmp |= (value.vec.u[3] >> 28) & 8;
  return tmp;
#endif
}
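// Usage sketch (illustrative): collapse a lane mask into a 4-bit integer,
// bit i set iff lane i is all-ones; 'a' and 'b' are hypothetical inputs.
//   int bits = F128::MoveMask(F128::CmpLt(a, b));
//   if (bits == 15) { /* a < b in every lane */ }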
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
  i128 casted = F128::CastToI128(value);
  return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t casted = vreinterpretq_u32_f32(value);
  casted = vpmaxq_u32(casted, casted);
  return vgetq_lane_u64(vreinterpretq_u64_u32(casted), 0) == 0;
#else
  int32x4_t casted = vreinterpretq_s32_f32(value);
  int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
  return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
#endif
#elif defined(NLIB_CAFE_PPC)
  uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
  return (tmp & 0x80000000U) == 0;
#endif
}
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
         value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
  i128 casted = F128::CastToI128(value);
  return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  uint32x4_t casted = vreinterpretq_u32_f32(value);
  casted = vpminq_u32(casted, casted);
  return vgetq_lane_s64(vreinterpretq_s64_u32(casted), 0) == -1;
#else
  int32x4_t casted = vreinterpretq_s32_f32(value);
  int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
  return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
#endif
#elif defined(NLIB_CAFE_PPC)
  uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
  return (tmp & 0x80000000U) != 0;
#endif
}
template <size_t N>
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.v[N];
#elif defined(NLIB_SSE41)
  float dest;
  _MM_EXTRACT_FLOAT(dest, value, N);
  return dest;
#elif defined(NLIB_NEON)
  return vgetq_lane_f32(value, N);
#elif defined(NLIB_CAFE_PPC)
  return value.vec.ps[N / 2][N % 2];
#endif
}
template <size_t N>
NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return value.vec.u[N];
#elif defined(NLIB_SSE41)
  return _mm_extract_ps(value, N);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_f32(value);
  return vgetq_lane_u32(tmp, N);
#elif defined(NLIB_CAFE_PPC)
  return value.vec.u[N];
#endif
}
NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  return value.vec.v[idx];
#elif defined(NLIB_SSE41)
  // The extract macro needs a compile-time lane, so branch on the index.
  float dest;
  switch (idx) {
    case 0:
      _MM_EXTRACT_FLOAT(dest, value, 0);
      break;
    case 1:
      _MM_EXTRACT_FLOAT(dest, value, 1);
      break;
    case 2:
      _MM_EXTRACT_FLOAT(dest, value, 2);
      break;
    default:
      _MM_EXTRACT_FLOAT(dest, value, 3);
      break;
  }
  return dest;
#elif defined(NLIB_NEON)
  switch (idx) {
    case 0:
      return vgetq_lane_f32(value, 0);
    case 1:
      return vgetq_lane_f32(value, 1);
    case 2:
      return vgetq_lane_f32(value, 2);
    default:
      return vgetq_lane_f32(value, 3);
  }
#endif
}
NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  return value.vec.u[idx];
#elif defined(NLIB_SSE41)
  switch (idx) {
    case 0:
      return static_cast<uint32_t>(_mm_extract_ps(value, 0));
    case 1:
      return static_cast<uint32_t>(_mm_extract_ps(value, 1));
    case 2:
      return static_cast<uint32_t>(_mm_extract_ps(value, 2));
    default:
      return static_cast<uint32_t>(_mm_extract_ps(value, 3));
  }
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vreinterpretq_u32_f32(value);
  switch (idx) {
    case 0:
      return vgetq_lane_u32(tmp, 0);
    case 1:
      return vgetq_lane_u32(tmp, 1);
    case 2:
      return vgetq_lane_u32(tmp, 2);
    default:
      return vgetq_lane_u32(tmp, 3);
  }
#endif
}
template <size_t N>
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = v;
  return ret;
#elif defined(NLIB_SSE41)
  f128 tmp = _mm_set_ss(v);
  return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
  return vsetq_lane_f32(v, value, N);
#elif defined(NLIB_CAFE_PPC)
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = v;
  return ret;
#endif
}
NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[i] = v;
  return ret;
#elif defined(NLIB_SSE41)
  f128 tmp = _mm_set_ss(v);
  switch (i) {
    case 0:
      return _mm_insert_ps(value, tmp, 0x00);
    case 1:
      return _mm_insert_ps(value, tmp, 0x10);
    case 2:
      return _mm_insert_ps(value, tmp, 0x20);
    default:
      return _mm_insert_ps(value, tmp, 0x30);
  }
#elif defined(NLIB_NEON)
  switch (i) {
    case 0:
      return vsetq_lane_f32(v, value, 0);
    case 1:
      return vsetq_lane_f32(v, value, 1);
    case 2:
      return vsetq_lane_f32(v, value, 2);
    default:
      return vsetq_lane_f32(v, value, 3);
  }
#elif defined(NLIB_CAFE_PPC)
  f128 ret = value;
  switch (i) {
    case 0:
      ret.vec.ps[0][0] = v;
      break;
    case 1:
      ret.vec.ps[0][1] = v;
      break;
    case 2:
      ret.vec.ps[1][0] = v;
      break;
    default:
      ret.vec.ps[1][1] = v;
      break;
  }
  return ret;
#endif
}
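// Usage sketch (illustrative): the *Lane accessors fix the lane at compile
// time, while the *ByIndex accessors take a run-time index and branch.
//   float x0 = F128::GetFloatFromLane<0>(v);      // v: hypothetical f128
//   f128 w = F128::SetFloatByIndex(v, 3.f, idx);  // idx: run-time 0..3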
namespace detail {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Merge one lane of a with one lane of b into a 64-bit vector.
template <bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(a, b);
#else
  return vtrn_f32(a, b).val[0];
#endif
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(vrev64_f32(a), b);
#else
  return vtrn_f32(vrev64_f32(a), b).val[0];
#endif
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1_f32(a, vrev64_f32(b));
#else
  return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn2_f32(a, b);
#else
  return vtrn_f32(a, b).val[1];
#endif
}
// Select the low (N == 0) or high (N == 1) 64-bit half of a register.
template <size_t N>
float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
  return vget_low_f32(value);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
  return vget_high_f32(value);
}

template <size_t X0, size_t X1>
struct F128SwizzleHelper2 {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
    float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

template <size_t X>
struct F128SwizzleHelper2<X, X> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t x = F128SwizzleGet64<X / 2>(value);
    return vdup_lane_f32(x, (X & 1));
  }
};
template <>
struct F128SwizzleHelper2<0, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vget_low_f32(value);
  }
};

template <>
struct F128SwizzleHelper2<0, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp1q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

template <>
struct F128SwizzleHelper2<0, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

template <>
struct F128SwizzleHelper2<1, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_low_f32(value));
  }
};

template <>
struct F128SwizzleHelper2<1, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip2_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

template <>
struct F128SwizzleHelper2<1, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp2q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

template <>
struct F128SwizzleHelper2<2, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

template <>
struct F128SwizzleHelper2<2, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

template <>
struct F128SwizzleHelper2<2, 3> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vget_high_f32(value);
  }
};

template <>
struct F128SwizzleHelper2<3, 0> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

template <>
struct F128SwizzleHelper2<3, 1> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
    return vzip2_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

template <>
struct F128SwizzleHelper2<3, 2> {
  static NLIB_ALWAYS_INLINE float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_high_f32(value));
  }
};
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128SwizzleHelper {
  static NLIB_M(f128) Swizzle(f128arg value) NLIB_NOEXCEPT {
    return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
                        detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
  }
};

template <size_t Vx, size_t Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
  static NLIB_M(f128) Swizzle(f128arg value) NLIB_NOEXCEPT {
    float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
    return vcombine_f32(tmp, tmp);
  }
};

template <size_t V>
struct F128SwizzleHelper<V, V, V, V> {
  static NLIB_M(f128) Swizzle(f128arg value) NLIB_NOEXCEPT {
    // Broadcast a single lane (body assumed for the elided source lines).
    float32x2_t x = F128SwizzleGet64<V / 2>(value);
    return vdupq_lane_f32(x, (V & 1));
  }
};
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
// Indices 0-1 select from v0, 2-3 from v1.
template <size_t X0, size_t X1>
struct F128SwizzleHelper;

template <>
struct F128SwizzleHelper<0, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v0); }
};

template <>
struct F128SwizzleHelper<0, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v0; }
};

template <>
struct F128SwizzleHelper<0, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v1); }
};

template <>
struct F128SwizzleHelper<0, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v0, v1); }
};

template <>
struct F128SwizzleHelper<1, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v0); }
};

template <>
struct F128SwizzleHelper<1, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v0); }
};

template <>
struct F128SwizzleHelper<1, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v1); }
};

template <>
struct F128SwizzleHelper<1, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v1); }
};

template <>
struct F128SwizzleHelper<2, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v0); }
};

template <>
struct F128SwizzleHelper<2, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v1, v0); }
};

template <>
struct F128SwizzleHelper<2, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v1); }
};

template <>
struct F128SwizzleHelper<2, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v1; }
};

template <>
struct F128SwizzleHelper<3, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v0); }
};

template <>
struct F128SwizzleHelper<3, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v0); }
};

template <>
struct F128SwizzleHelper<3, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v1); }
};

template <>
struct F128SwizzleHelper<3, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v1); }
};
#endif
}  // namespace detail
template <size_t V0, size_t V1, size_t V2, size_t V3>
NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.v[0] = value.vec.v[V0];
  ret.vec.v[1] = value.vec.v[V1];
  ret.vec.v[2] = value.vec.v[V2];
  ret.vec.v[3] = value.vec.v[V3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(V3, V2, V1, V0));
#elif defined(NLIB_NEON)
  return detail::F128SwizzleHelper<V0, V1, V2, V3>::Swizzle(value);
#elif defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.ps[0] = detail::F128SwizzleHelper<V0, V1>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
  ret.vec.ps[1] = detail::F128SwizzleHelper<V2, V3>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
  return ret;
#endif
}
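// Usage sketch (illustrative): Swizzle reorders the lanes of one register
// with compile-time indices; each instantiation maps to a single shuffle
// where the target ISA provides one.
//   f128 swapped = F128::Swizzle<1, 0, 3, 2>(v);  // pairwise swap
//   f128 bcast0 = F128::Swizzle<0, 0, 0, 0>(v);   // broadcast lane 0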
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip1q_f32(value, value);
#else
  return vzipq_f32(value, value).val[0];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn1q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[0];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
  return value;
}

template <>
NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp1q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[0];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
  return vrev64q_f32(value);
}

template <>
NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vtrn2q_f32(value, value);
#else
  return vtrnq_f32(value, value).val[1];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 1);
  return vreinterpretq_f32_u32(rotated);
}

template <>
NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vuzp2q_f32(value, value);
#else
  return vuzpq_f32(value, value).val[1];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
  return vzip2q_f32(value, value);
#else
  return vzipq_f32(value, value).val[1];
#endif
}

template <>
NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 2);
  return vreinterpretq_f32_u32(rotated);
}

template <>
NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
  uint32x4_t ival = vreinterpretq_u32_f32(value);
  uint32x4_t rotated = vextq_u32(ival, ival, 3);
  return vreinterpretq_f32_u32(rotated);
}
#endif
namespace detail {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
template <bool UseBlend, bool UseShuffle, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2 {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // General case: swizzle both sources, then blend the wanted lanes.
    f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
    f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
    return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                                 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

template <bool UseShuffle, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                               ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
                          _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
  }
};

template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
    return _mm_castsi128_ps(tmp);
  }
};

template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
    return _mm_castsi128_ps(tmp);
  }
};

template <>
struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
    return _mm_castsi128_ps(tmp);
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
  }
};

template <size_t V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
  }
};

template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128PermuteHelper2<
        ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
        ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
        V0, V1, V2, V3>::Permute(a, b);
  }
};
#elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Select one of the four 64-bit halves of the (a, b) pair.
template <size_t N>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_low_f32(a);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_high_f32(a);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_low_f32(b);
}

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_high_f32(b);
}

template <size_t X0, size_t X1>
struct F128PermuteHelper2 {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
    float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

template <size_t X>
struct F128PermuteHelper2<X, X> {
  static NLIB_ALWAYS_INLINE float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x = F128PermuteGet64<X / 2>(a, b);
    return vdup_lane_f32(x, (X & 1));
  }
};

template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
                        F128PermuteHelper2<V2, V3>::Permute(a, b));
  }
};

template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};

template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
    return vreinterpretq_f32_s32(tmp);
  }
};

template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};
#elif defined(NLIB_CAFE_PPC) && !defined(NLIB_F128_SIMD_NOUSE)
// (R0, R1) are the lane positions within each half; (VAR0, VAR1) pick which
// of the four 64-bit halves v0..v3 to read from.
template <size_t R0, size_t R1, size_t VAR0, size_t VAR1>
struct F128PermuteHelper2 {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 0, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
  }
};

template <size_t R0, size_t R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
  }
};
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret;
    f32x2 x0 = a.vec.ps[0];
    f32x2 x1 = a.vec.ps[1];
    f32x2 x2 = b.vec.ps[0];
    f32x2 x3 = b.vec.ps[1];
    ret.vec.ps[0] =
        F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>::Permute(x0, x1, x2, x3);
    ret.vec.ps[1] =
        F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>::Permute(x0, x1, x2, x3);
    return ret;
  }
};
#else
template <bool IsAllA, bool IsAllB, size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                              F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                              F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                              F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
    return ret;
  }
};
#endif
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Every lane comes from a.
    return F128::Swizzle<V0, V1, V2, V3>(a);
  }
};

template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Every lane comes from b.
    return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
  }
};
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(a, b);
  }
};

template <>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(b, a);
  }
};

template <>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(a, b);
  }
};

template <>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(b, a);
  }
};
#endif
template <size_t V0, size_t V1, size_t V2, size_t V3>
struct F128PermuteDontCareHelper {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // All indices are concrete here (0-7); dispatch on whether every lane
    // reads from a, every lane from b, or a mixture of the two.
    static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
    static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
    return detail::F128PermuteHelper<arg1, arg2, V0, V1, V2, V3>::Permute(a, b);
  }
};

// An index of 8 means "don't care"; each specialization below substitutes a
// concrete index that keeps the resulting shuffle cheap.
template <size_t V1, size_t V2, size_t V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = (V1 & 1) ? V1 - 1 : V1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V0, size_t V2, size_t V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V1 = (V0 & 1) ? V0 : (V0 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V0, size_t V1, size_t V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V0, size_t V1, size_t V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V3 = (V2 & 1) ? V2 : (V2 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V2, size_t V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = (V2 < 4) ? 0 : 4;
    return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
  }
};

template <size_t V1, size_t V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = (V1 & 1) ? V1 - 1 : V1;
    static const size_t V3 = (V2 & 1) ? V2 : V2 + 1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V0, size_t V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V2 = (V1 < 4) ? 2 : 6;
    return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
  }
};

template <size_t V0, size_t V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V1 = (V0 & 1) ? V0 : V0 + 1;
    static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V0, size_t V2>
struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V1 = (V0 & 1) ? V0 : V0 + 1;
    static const size_t V3 = (V2 & 1) ? V2 : V2 + 1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V1, size_t V3>
struct F128PermuteDontCareHelper<8, V1, 8, V3> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = (V1 & 1) ? V1 - 1 : V1;
    static const size_t V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V>
struct F128PermuteDontCareHelper<V, 8, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V1 = ((V & 3) == 0) ? V + 1 : V;
    static const size_t V2 = ((V & 3) == 0) ? V + 2 : V;
    static const size_t V3 = ((V & 3) == 0) ? V + 3 : V;
    return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
  }
};

template <size_t V>
struct F128PermuteDontCareHelper<8, V, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = ((V & 3) == 1) ? V - 1 : V;
    static const size_t V2 = ((V & 3) == 1) ? V + 1 : V;
    static const size_t V3 = ((V & 3) == 1) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
  }
};

template <size_t V>
struct F128PermuteDontCareHelper<8, 8, V, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = ((V & 3) == 2) ? V - 2 : V;
    static const size_t V1 = ((V & 3) == 2) ? V - 1 : V;
    static const size_t V3 = ((V & 3) == 2) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
  }
};

template <size_t V>
struct F128PermuteDontCareHelper<8, 8, 8, V> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const size_t V0 = ((V & 3) == 3) ? V - 3 : V;
    static const size_t V1 = ((V & 3) == 3) ? V - 2 : V;
    static const size_t V2 = ((V & 3) == 3) ? V - 1 : V;
    return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
  }
};

template <>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
  static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Every lane is don't-care; returning a (body assumed for the elided lines).
    return a;
  }
};
}  // namespace detail

template <size_t V0, size_t V1, size_t V2, size_t V3>
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return detail::F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
}
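// Usage sketch (illustrative): Permute picks each destination lane from two
// sources: indices 0-3 read a, 4-7 read b, and 8 marks a don't-care lane that
// the helpers above resolve to whatever keeps the shuffle cheapest.
//   f128 lo = F128::Permute<0, 4, 1, 5>(a, b);  // interleave the low halves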
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  const size_t v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
  const size_t v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
  const size_t v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
  const size_t v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
  const size_t v0 = SplatLane0 ? 4 : 0;
  const size_t v1 = SplatLane1 ? 5 : 1;
  const size_t v2 = SplatLane2 ? 6 : 2;
  const size_t v3 = SplatLane3 ? 7 : 3;
#endif
  return F128::Permute<v0, v1, v2, v3>(value, splat);
}
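// Usage sketch (illustrative): Splat replaces the lanes selected by the bool
// template arguments with lanes of 'splat'. Note the NEON constants may read
// a partner lane of the pair, which assumes 'splat' holds a uniform value.
//   f128 r = F128::Splat<true, false, false, true>(value, splat);
//   // r = {splat lane, value1, value2, splat lane}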
NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
  return ret;
#else
  // Split into integer and fractional parts: 2^v = 2^round(v) * 2^f.
  i128 iround = F128::ConvertToI128Round(value);
  f128 fround = F128::ConvertFromI128(iround);
  f128 x = F128::Sub(value, fround);
  f128 xx = F128::Mult(x, x);
  f128 P = F128::LoadA16(F128::exp2_P_);
  f128 Q = F128::LoadA16(F128::exp2_Q_);
  f128 px = F128::SetValue<0>(P, each_select32);
  px = F128::MultAdd(px, xx, F128::SetValue<1>(P, each_select32));
  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
  px = F128::Mult(x, px);
  f128 qx = F128::SetValue<0>(Q, each_select32);
  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
  x = F128::Div(px, F128::Sub(qx, px));
  // 2^f = 1 + 2*px/(qx - px); this completion step is assumed for the
  // elided source lines (standard rational-approximation form).
  x = F128::MultAdd(x, F128::SetValue(2.f, each_float), F128::SetOne());
  // Build 2^round(v) directly in the exponent bits and scale.
  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
  iround = I128::ShiftLeftLogical32(iround, 23);
  x = F128::Mult(x, F128::CastFromI128(iround));
  return x;
#endif
}
NLIB_M(f128) F128::ExpE(f128arg value) NLIB_NOEXCEPT {
  static const float log2e = 1.44269504088896340736f;
  return Exp2(F128::Mult(log2e, value));
}
NLIB_M(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
  // sinh(v) = e^v/2 - e^(-v)/2, computed as base-2 exponentials.
  static const float log2e = 1.44269504088896340736f;
  f128 negOne = F128::SetValue(-1.f, each_float);
  f128 v0 = F128::MultAdd(log2e, value, negOne);
  f128 v1 = F128::MultSub(log2e, value, negOne);
  f128 e0 = F128::Exp2(v0);
  f128 e1 = F128::Exp2(v1);
  return F128::Sub(e0, e1);
}
NLIB_M(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
  // cosh(v) = e^v/2 + e^(-v)/2, computed as base-2 exponentials.
  static const float log2e = 1.44269504088896340736f;
  f128 negOne = F128::SetValue(-1.f, each_float);
  f128 v0 = F128::MultAdd(log2e, value, negOne);
  f128 v1 = F128::MultSub(log2e, value, negOne);
  f128 e0 = F128::Exp2(v0);
  f128 e1 = F128::Exp2(v1);
  return F128::Add(e0, e1);
}
NLIB_M(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
  // tanh(v) = 1 - 2/(e^(2v) + 1); the exact use of tanh_cvalue_ lanes is
  // assumed for the elided source lines.
  f128 cvalue = F128::LoadA16(tanh_cvalue_);
  f128 half = F128::SetValue(0.5f, each_float);
  f128 e = F128::Exp2(F128::Mult(F128::SetValue<0>(cvalue, each_select32), value));
  e = F128::MultAdd(half, e, half);
  e = F128::Recp(e);
  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
}
NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.v[0] = tanf(value.vec.v[0]);
  ret.vec.v[1] = tanf(value.vec.v[1]);
  ret.vec.v[2] = tanf(value.vec.v[2]);
  ret.vec.v[3] = tanf(value.vec.v[3]);
  return ret;
#else
  f128 C = F128::LoadA16(&F128::tan_c_[0]);
  // g = round(value * 2/pi): index of the nearest multiple of pi/2.
  f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
  i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
  i128 cmp = I128::CmpEq32(t0, I128::SetZero());
  f128 nearXAxis = F128::CastFromI128(cmp);
  // Reduce to f = value - g*pi/2; the two-step split below is assumed for
  // the elided source lines.
  f128 f = F128::MultSub(g, F128::SetValue<1>(C, each_select32), value);
  f = F128::MultSub(g, F128::SetValue<2>(C, each_select32), f);
  f128 one = F128::SetOne();
  f128 zero = F128::SetZero();
  f128 nearAxis = F128::CmpNearEq(f, zero, F128::SetValue<3>(C, each_select32));
  f128 P = F128::LoadA16(&F128::tan_p_[0]);
  f128 Q = F128::LoadA16(&F128::tan_q_[0]);
  f128 ff = F128::Mult(f, f);
  f128 p = F128::SetValue<2>(P, each_select32);
  p = F128::MultAdd(p, ff, F128::SetValue<1>(P, each_select32));
  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
  p = F128::MultAdd(p, ff, one);
  p = F128::Mult(f, p);
  f128 q = F128::SetValue<3>(Q, each_select32);
  q = F128::MultAdd(q, ff, F128::SetValue<2>(Q, each_select32));
  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
  q = F128::MultAdd(q, ff, one);
  p = F128::Select(nearAxis, f, p);
  q = F128::Select(nearAxis, one, q);
  f128 r0 = F128::Div(p, q);
  f128 r1 = F128::Negate(F128::Recp(r0));
  // Even multiples of pi/2 -> tan(f); odd multiples -> -1/tan(f).
  return F128::Select(nearXAxis, r0, r1);
#endif
}
NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_CAFE_PPC)
  static const float scale = 1.4426950408889634f;  // log2(e)
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return F128::Mult(scale, ret);
#else
  // Split value into mantissa x in [1, 2) and unbiased exponent e.
  f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
  e = I128::ShiftRightLogical32(e, 23);
  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
  x = F128::Sub(x, F128::SetOne());
  f128 z = F128::Mult(x, x);
  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
  f128 p = F128::SetValue<0>(pq0, each_select32);
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
  f128 y = F128::Mult(z, p);
  y = F128::Div(y, q);
  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
  // Recombine with log2(e) - 1 (the Cephes LOG2EA constant; assumed for the
  // elided source lines) and add back the exponent.
  f128 log2ea = F128::SetValue(0.44269504088896340736f, each_float);
  f128 result = F128::Mult(y, log2ea);
  result = F128::MultAdd(log2ea, x, result);
  result = F128::Add(result, y);
  result = F128::Add(result, x);
  result = F128::Add(result, F128::ConvertFromI128(e));
  // Special cases: NaN, +inf, zero, and negative inputs.
  f128 zero = F128::SetZero();
  f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
  // Lane layout of nan_inf_ assumed here: {nan, -nan, +inf, -inf}.
  f128 nan = F128::SetValue<0>(nan_inf, each_select32);
  f128 neg_nan = F128::SetValue<1>(nan_inf, each_select32);
  f128 inf = F128::SetValue<2>(nan_inf, each_select32);
  f128 neg_inf = F128::SetValue<3>(nan_inf, each_select32);
  f128 is_nan = F128::IsNaN(value);
  result = F128::Select(is_nan, nan, result);
  f128 is_inf = F128::IsInfinite(value);
  f128 is_pos = F128::CmpGt(value, zero);
  f128 is_pos_inf = F128::And(is_inf, is_pos);
  result = F128::Select(is_pos_inf, inf, result);
  f128 is_zero = F128::CmpEq(value, zero);
  result = F128::Select(is_zero, neg_inf, result);
  f128 is_neg = F128::CmpLt(value, zero);
  result = F128::Select(is_neg, neg_nan, result);
  return result;
#endif
}
NLIB_M(f128) F128::LogE(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return ret;
#else
  f128 x = F128::Log2(value);
  static const float recp_log2e = 0.6931471805597018f;
  return F128::Mult(recp_log2e, x);
#endif
}
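// Usage sketch (illustrative): the transcendental family is built on base-2
// kernels; the base-e entry points rescale by log2(e) or its reciprocal.
//   f128 y = F128::ExpE(x);  // e^x via Exp2(x * log2e)
//   f128 l = F128::LogE(y);  // ln(y) via Log2(y) * ln(2)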
#endif  // NLIB_DOXYGEN

#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
// The structure for keeping a 4x4 matrix as four row registers. The class
// scaffolding around these constructors is assumed from the out-of-class
// definitions below.
struct SimdMatrix {
  SimdMatrix() NLIB_NOEXCEPT {}
  SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) NLIB_NOEXCEPT {
    r[0] = r0;
    r[1] = r1;
    r[2] = r2;
    r[3] = r3;
  }
  SimdMatrix(float m00, float m01, float m02, float m03,
             float m10, float m11, float m12, float m13,
             float m20, float m21, float m22, float m23,
             float m30, float m31, float m32, float m33) NLIB_NOEXCEPT;
  explicit SimdMatrix(const float* p) NLIB_NOEXCEPT;
  f128 r[4];
};
inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03,
                              float m10, float m11, float m12, float m13,
                              float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
  r[0] = F128::SetValue(m00, m01, m02, m03);
  r[1] = F128::SetValue(m10, m11, m12, m13);
  r[2] = F128::SetValue(m20, m21, m22, m23);
  r[3] = F128::SetValue(m30, m31, m32, m33);
}
inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
  NLIB_ASSERT((algn & 3) == 0);
  switch (algn >> 2) {
    case 0:
      r[0] = F128::LoadA16(p);
      r[1] = F128::LoadA16(p + 4);
      r[2] = F128::LoadA16(p + 8);
      r[3] = F128::LoadA16(p + 12);
      break;
    case 1:
    case 3:
      r[0] = F128::LoadA4(p);
      r[1] = F128::LoadA4(p + 4);
      r[2] = F128::LoadA4(p + 8);
      r[3] = F128::LoadA4(p + 12);
      break;
    case 2:
      r[0] = F128::LoadA8(p);
      r[1] = F128::LoadA8(p + 4);
      r[2] = F128::LoadA8(p + 8);
      r[3] = F128::LoadA8(p + 12);
      break;
  }
}
#endif  // !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
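// Usage sketch (illustrative): the pointer constructor inspects the low
// address bits and issues the widest aligned loads the address allows.
//   NLIB_ALIGNAS(16) float m[16] = {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f,
//                                   0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f};
//   SimdMatrix mtx(m);  // 16-byte aligned, so every row uses LoadA16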
#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// Matrix arguments are passed by reference where register passing is
// unavailable (typedef name assumed, mirroring the f128arg_ex pattern).
typedef const SimdMatrix& SimdMatrixArg;
#else
typedef const SimdMatrix SimdMatrixArg;
#endif

#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)    \
  {                                                    \
    f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
    f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
    f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
    f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
    row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1);      \
    row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1);      \
    row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3);      \
    row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3);      \
  }
#elif defined(NLIB_NEON)
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)                                     \
  {                                                                                     \
    float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1);                                      \
    float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3);                                      \
    row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));    \
    row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));    \
    row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));  \
    row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));  \
  }
#elif defined(NLIB_CAFE_PPC)
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)       \
  {                                                       \
    f32x2 tmp0;                                           \
    f32x2 tmp1;                                           \
    tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]);  \
    row0.vec.ps[0] = tmp0;                                \
    row1.vec.ps[0] = tmp1;                                \
    tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]);  \
    row2.vec.ps[1] = tmp0;                                \
    row3.vec.ps[1] = tmp1;                                \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]);  \
    row0.vec.ps[1] = row2.vec.ps[0];                      \
    row1.vec.ps[1] = row3.vec.ps[0];                      \
    row2.vec.ps[0] = tmp0;                                \
    row3.vec.ps[0] = tmp1;                                \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]);  \
    row0.vec.ps[1] = tmp0;                                \
    row1.vec.ps[1] = tmp1;                                \
  }
#endif
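// Usage sketch (illustrative): transpose four row registers in place.
//   NLIB_F128_TRANSPOSE(r0, r1, r2, r3);  // r0..r3 now hold the columns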
#endif  // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_