#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

#ifdef NN_PLATFORM_CTR
# ifndef __USE_C99_MATH
#  define __USE_C99_MATH
# endif
#endif

// ...
#define INFINITY ((float)(1e+300 * 1e+300))

#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
// ... (scalar fallback definition of f128)
#elif defined(NLIB_SSE41)
// ...
#elif defined(NLIB_NEON)
// ...
#endif

#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg;
#else
typedef const f128 f128arg;
#endif

#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif
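// f128arg_ex presumably exists because some ABIs (notably MSVC without
// __vectorcall, and the no-SIMD fallback) cannot pass a fourth or fifth
// 128-bit vector argument in registers; the extended-argument positions fall
// back to const references while f128arg stays by-value where the ABI allows.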
#if !defined(_MSC_VER) || _MSC_VER < 1800
// ...
#endif

// ... (the declarations below are members of class F128)
    static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
    // ...
    template <size_t N>
    static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA16(uintptr_t p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA8(uintptr_t p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA4(uintptr_t p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA16(intptr_t p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA8(intptr_t p) NLIB_NOEXCEPT;
    static f128 __vectorcall LoadA4(intptr_t p) NLIB_NOEXCEPT;
    static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
    static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
    // ...
    template <int N>
    static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
    // ...
#endif

    static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
    template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
    static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
    static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
    template <size_t N>
    static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
                                     each_select32_tag) NLIB_NOEXCEPT;
    static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
    static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
    template <size_t N>
    static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
                                     each_select32_tag) NLIB_NOEXCEPT;
    static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
    static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
    static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
    static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
    static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
    static f128 __vectorcall
    Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
    static f128 __vectorcall
    CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
    static f128 __vectorcall
    BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
    // ...
    static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
    static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
    static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
    static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
    // ...
    static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
    // ...
    template <size_t N>
    static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
    template <size_t N>
    static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
    static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
    static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
    // ...
    template <size_t N>
    static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
    static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
    // ...
    template <int V0, int V1, int V2, int V3>
    static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
    template <int V0, int V1, int V2, int V3>
    static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
    template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
    static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;

    template <size_t N>
    static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
        const size_t NN = 4 - N;
        return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
    }
    template <size_t N>
    static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
        return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
    }
    template <size_t N>
    static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
    }
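// A minimal usage sketch (illustrative only, assuming the nlib types above):
//   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 r = F128::RotateLeft<1>(v);           // lanes rotated by one position
//   f128 m = F128::CmpLt(v, F128::SetZero());  // all-ones/all-zero lane masks
// RotateLeft<N> reduces to a Swizzle whose indices are offset by (4 - N) mod 4,
// and ShiftRight<N> to a Permute that pulls the vacated lanes from its second
// argument (Permute indices 4..7 address the second vector).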
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// NOTE: signatures and short spans lost in extraction are restored below from
// the declarations and call sites; gaps that could not be restored with
// confidence are marked "// ...".

// Broadcast v to all four lanes.
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] = v;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
    return vdupq_n_f32(v);
#else
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
    return ret;
#endif
}

// Broadcast a 32-bit pattern to all four lanes.
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    union { uint32_t u32; float f32; } tmp;
    tmp.u32 = v;
    return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vdupq_n_u32(v);
    return vreinterpretq_f32_u32(tmp);
#else
    union { uint32_t u32; float f32; } tmp;
    tmp.u32 = v;
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
    return ret;
#endif
}
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
    // (a/b and c/d are first packed into two 64-bit halves; lines elided)
    // ...
    return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#else
    f128 ret;
    ret.vec.ps[0][0] = a;
    ret.vec.ps[0][1] = b;
    ret.vec.ps[1][0] = c;
    ret.vec.ps[1][1] = d;
    return ret;
#endif
}
// SetValue<N>(value, each_select32): broadcast lane N to all four lanes.
template <size_t N>
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = value.vec.v[N];
    ret.vec.v[1] = value.vec.v[N];
    ret.vec.v[2] = value.vec.v[N];
    ret.vec.v[3] = value.vec.v[N];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
    float32x2_t tmp = vget_low_f32(value);
    return vdupq_lane_f32(tmp, N);
#else
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
    return ret;
#endif
}

#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// vdupq_lane_f32 only addresses a 64-bit half, so lanes 2 and 3 are specialized.
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    float32x2_t tmp = vget_high_f32(value);
    return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
NLIB_M(f128) F128::SetValue<0>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<1>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);
    return ret;
}
#endif
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    return _mm_setzero_ps();
#elif defined(NLIB_NEON)
    return vdupq_n_f32(0);
#else
    f128 ret;
    ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
    return ret;
#endif
}

// Set1000() = {1, 0, 0, 0}, Set0100() = {0, 1, 0, 0}, etc.
NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_NEON)
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);  // {1.f, 0.f}
    float32x2_t x00 = vcreate_f32(0ULL);                   // {0.f, 0.f}
    return vcombine_f32(x10, x00);
#else
    return F128::LoadA16(F128::v1000_);
#endif
}

NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_NEON)
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);  // {0.f, 1.f}
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x01, x00);
#else
    return F128::LoadA16(F128::v0100_);
#endif
}

NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_NEON)
    float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x10);
#else
    return F128::LoadA16(F128::v0010_);
#endif
}

NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_NEON)
    float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
    float32x2_t x00 = vcreate_f32(0ULL);
    return vcombine_f32(x00, x01);
#else
    return F128::LoadA16(F128::v0001_);
#endif
}
template <size_t N>
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    // The low 4 bits of the immediate are the zero mask.
    return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
    // Permute indices 4..7 select from the zero vector.
    return F128::Permute<N == 0 ? 4 : 0,
                         N == 1 ? 5 : 1,
                         N == 2 ? 6 : 2,
                         N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
#else
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = 0.f;
    return ret;
#endif
}
// Constant generators (the names of the -1 and epsilon variants are restored
// by analogy with SetOne/SetSignMask, which are referenced below).
NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
    return F128::SetValue(1.f, each_float);
}
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
    return F128::SetValue(-1.f, each_float);
}
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
    return F128::SetValue(1.0e-7f, each_float);
}
// ... (SetInfinity() and related generators elided)
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
    return F128::SetValue(-0.f, each_float);
}
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    return _mm_load_ps(p);
#elif defined(NLIB_NEON)
    const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
    uint64x2_t val = vld1q_u64(tmp);
    return vreinterpretq_f32_u64(val);
#else
    f128 ret;
    ret.vec.ps[0][0] = p[0];
    ret.vec.ps[0][1] = p[1];
    ret.vec.ps[1][0] = p[2];
    ret.vec.ps[1][1] = p[3];
    return ret;
#endif
}

NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
    // ...
#else
    f128 ret;
    ret.vec.ps[0][0] = p[0];
    ret.vec.ps[0][1] = p[1];
    ret.vec.ps[1][0] = p[2];
    ret.vec.ps[1][1] = p[3];
    return ret;
#endif
}

NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    // An 8-byte-aligned load can go through the 64-bit lanes on NEON.
    const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
    uint64x2_t val = vld1q_u64(tmp);
    return vreinterpretq_f32_u64(val);
#else
    // ...
#endif
}

NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
    return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
    return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
    return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
    p[2] = value.vec.v[2];
    p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
    uint64x2_t tmp = vreinterpretq_u64_f32(value);
    vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
    p[2] = value.vec.ps[1][0];
    p[3] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#elif defined(NLIB_SSE41)
    _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
    // ...
#else
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
    p[2] = value.vec.ps[1][0];
    p[3] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    uint64x2_t tmp = vreinterpretq_u64_f32(value);
    vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
    // ...
#endif
}
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreA4(reinterpret_cast<float*>(p), value);
}
// StoreLo* store the low two lanes only.
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
    _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
    vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
#endif
}

NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[0];
    p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
    _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    float32x2_t tmp = vget_low_f32(value);
    vst1_f32(p, tmp);
#else
    p[0] = value.vec.ps[0][0];
    p[1] = value.vec.ps[0][1];
#endif
}

NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreLoA4(reinterpret_cast<float*>(p), value);
}
// StoreHi* store the high two lanes only.
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    vst1_f32(p, vget_high_f32(value));
#else
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    p[0] = value.vec.v[2];
    p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
    _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
    float32x2_t tmp = vget_high_f32(value);
    vst1_f32(p, tmp);
#else
    p[0] = value.vec.ps[1][0];
    p[1] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
    StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
    ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
    ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
    ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vabsq_f32(value);
#elif defined(NLIB_SSE41)
    // Clearing the sign bit is cheaper than a compare-and-negate.
    const __m128 signmask = _mm_set1_ps(-0.0f);
    return _mm_andnot_ps(signmask, value);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
    return ret;
#endif
}
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 result;
    result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
    result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
    result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
    result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
    return result;
#elif defined(NLIB_SSE41)
    return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
    return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#else
    f128 mask_ = mask;
    mask_.vec.u[0] &= 0xFF7FFFFFUL;
    mask_.vec.u[1] &= 0xFF7FFFFFUL;
    mask_.vec.u[2] &= 0xFF7FFFFFUL;
    mask_.vec.u[3] &= 0xFF7FFFFFUL;
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// int32x4 <-> float32x4 conversions and bit casts (SSE/NEON only).
NLIB_M(f128) F128::ConvertFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
    return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}

NLIB_M(f128) F128::CastFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_f32_s8(value);
#endif
}

NLIB_M(i128) F128::ConvertToI128(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
    // Round to nearest by adding +/-0.5 (with the sign of each lane) before
    // the truncating conversion.
    uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
    uint32x4_t sgn = vdupq_n_u32(0x80000000U);
    uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
    w = vorrq_u32(w, half);
    return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}

NLIB_M(i128) F128::ConvertToI128Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}

NLIB_M(i128) F128::CastToI128(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
    return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
    return vreinterpretq_s8_f32(value);
#endif
}
template <int N>
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
    return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
    // Multiply by 2^-N: (0x7F - N) << 23 is the IEEE-754 bit pattern of 2^-N.
    f128 f = F128::ConvertFromI128(value);
    f128 m = F128::SetValue((0x7F - N) << 23, each_uint32);
    return F128::Mult(f, m);
#endif
}

template <int N>
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
    return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
    // Multiply by 2^N ((0x7F + N) << 23), then truncate.
    f128 m = F128::SetValue((0x7F + N) << 23, each_uint32);
    f128 f = F128::Mult(value, m);
    return F128::ConvertToI128Truncate(f);
#endif
}
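// For example, with N == 8 a lane holding the integer 384 (a 24.8 fixed-point
// value) converts to 384 * 2^-8 = 1.5f, and 1.5f converts back to 384. The
// exponent-bit trick works because scaling by an exact power of two only
// changes the exponent field, so the multiply is lossless.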
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcltq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcleq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgtq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vcgeq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
    return vaddq_f32(a, b);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}

NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
    return vsubq_f32(a, b);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}

NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = -value.vec.v[0];
    ret.vec.v[1] = -value.vec.v[1];
    ret.vec.v[2] = -value.vec.v[2];
    ret.vec.v[3] = -value.vec.v[3];
    return ret;
#elif defined(NLIB_NEON)
    return vnegq_f32(value);
#elif defined(NLIB_SSE41)
    // Flipping the sign bit negates without touching NaN payloads.
    const __m128 signmask = _mm_set1_ps(-0.0f);
    return _mm_xor_ps(signmask, value);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
    return ret;
#endif
}
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
    return vmulq_f32(a, b);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}

NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
    ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
    return ret;
#else
    return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}

// Multiply b by lane N of a.
template <size_t N>
NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vmulq_laneq_f32(b, a, N);
#else
    float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
    return vmulq_n_f32(b, tmp);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    float t = a.vec.ps[N / 2][N % 2];
    f128 ret;
    ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
    ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
    return ret;
#else
    // ...
#endif
}
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vdivq_f32(a, b);
#else
    // AArch32 NEON has no vector divide; refine a reciprocal estimate instead.
    float32x4_t inv0 = vrecpeq_f32(b);
    float32x4_t step0 = vrecpsq_f32(inv0, b);
    float32x4_t inv1 = vmulq_f32(step0, inv0);
    float32x4_t step1 = vrecpsq_f32(inv1, b);
    float32x4_t inv2 = vmulq_f32(step1, inv1);
    uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
    inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
    return vmulq_f32(a, inv2);
#endif
#else
    f128 ret;
    ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}
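// vrecpsq_f32(x, b) computes (2 - x*b), so inv_{n+1} = inv_n * (2 - inv_n*b)
// is one Newton-Raphson step toward 1/b; two steps take the ~8-bit vrecpeq_f32
// estimate to roughly single precision. The zeromask select patches b == 0
// lanes to +infinity: the raw estimate would already be infinity there, but
// the refinement steps would turn it into NaN (0 * inf).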
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
    return vmaxq_f32(a, b);
#else
    // __PS_SEL picks its second operand where a - b >= 0.
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
    return ret;
#endif
}

NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
    ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
    ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
    ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
    return vminq_f32(a, b);
#else
    f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
    f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
    f128 ret;
    ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
    ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
    return ret;
#endif
}
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vpmaxq_f32(a, b);
#else
    float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
#endif
#else
    f128 ret;
    f32x2 v02, v13, cmp;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
    return ret;
#endif
}

NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
    f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vpminq_f32(a, b);
#else
    float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
    float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
    return vcombine_f32(rl, rh);
#endif
#else
    f128 ret;
    f32x2 v02, v13, cmp;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    cmp = __PS_SUB(v02, v13);
    ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
    return ret;
#endif
}
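// SSE4.1 has no pairwise max/min instruction, so each vector is combined with
// a copy of itself whose adjacent lanes are swapped (_MM_SHUFFLE(2, 3, 0, 1)
// yields {a1, a0, a3, a2}); the per-pair results of a and b are then gathered
// into one vector with _MM_SHUFFLE(2, 0, 2, 0).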
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
    ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
    ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
    ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vpaddq_f32(a, b);
#else
    float32x2_t al = vget_low_f32(a);
    float32x2_t ah = vget_high_f32(a);
    float32x2_t l = vpadd_f32(al, ah);
    float32x2_t bl = vget_low_f32(b);
    float32x2_t bh = vget_high_f32(b);
    float32x2_t h = vpadd_f32(bl, bh);
    return vcombine_f32(l, h);
#endif
#else
    f128 ret;
    f32x2 v02, v13;
    v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
    v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
    ret.vec.ps[0] = __PS_ADD(v02, v13);
    v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
    v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
    ret.vec.ps[1] = __PS_ADD(v02, v13);
    return ret;
#endif
}
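// A common idiom built on this: two successive PairwiseAdd calls leave the
// horizontal sum of all four lanes in every lane, e.g.
//   f128 s = F128::PairwiseAdd(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
//   s = F128::PairwiseAdd(s, s);       // {sum, sum, sum, sum}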
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    return vabdq_f32(a, b);
#else
    return F128::Abs(F128::Sub(a, b));
#endif
}

// MultAdd(a, b, c) returns a * b + c (fused where the ISA provides it).
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmaq_f32(c, a, b);
#else
    return vmlaq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    return F128::Add(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmaq_n_f32(c, b, a);
#else
    return vmlaq_n_f32(c, b, a);
#endif
#else
    return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}

// Multiply b by lane N of a, then add c.
template <size_t N>
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmaq_laneq_f32(c, b, a, N);
#else
    return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
    return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}

// MultSub(a, b, c) returns c - a * b.
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmsq_f32(c, a, b);
#else
    return vmlsq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
    ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
    return ret;
#else
    return F128::Sub(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmsq_n_f32(c, b, a);
#else
    return vmlsq_n_f32(c, b, a);
#endif
#else
    return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}

template <size_t N>
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
    return vfmsq_laneq_f32(c, b, a, N);
#else
    return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
    return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
    // a + t * (b - a)
    return F128::MultAdd(t, F128::Sub(b, a), a);
}
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // Wrap angle1 + angle2 back into [-pi, pi).
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    // ... (pi_dbl, a 2*pi splat, is prepared from the same constant table)
    f128 sum = F128::Add(angle1, angle2);
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}

NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
    // Wrap angle1 - angle2 back into [-pi, pi).
    f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
    // ... (pi_dbl, a 2*pi splat, is prepared from the same constant table)
    f128 sum = F128::Sub(angle1, angle2);
    f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
    f128 ofs = F128::And(cond, pi_dbl);
    f128 result = F128::Add(sum, ofs);
    cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
    ofs = F128::And(cond, pi_dbl);
    return F128::Sub(result, ofs);
}
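// The comparison masks are all-ones or all-zero per lane, so And(cond, pi_dbl)
// yields either 2*pi or 0.0f in each lane; adding and then subtracting that
// offset wraps only the lanes that fell outside [-pi, pi), with no branches.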
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),   // {2.f, 1.f}
                                   vcreate_f32(0x3F800000C0000000ULL));  // {-2.f, 1.f}
    f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),   // {-3.f, -2.f}
                                   vcreate_f32(0xBF80000040400000ULL));  // {3.f, -1.f}
#else
    f128 hermite_R0 = F128::LoadA16(hermite_R0_);
    f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif
    ttt = F128::Mult(ttt, hermite_R0);
    ttt = F128::MultAdd(tt, hermite_R1, ttt);
    ttt = F128::MultAdd(t, F128::Set0100(), ttt);
    ttt = F128::Add(ttt, F128::Set1000());
    // ... (the basis weights in ttt are combined with p0, v0, p1, v1)
}
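// After these steps ttt holds the four Hermite basis polynomials, one per lane:
//   {2t^3 - 3t^2 + 1,  t^3 - 2t^2 + t,  -2t^3 + 3t^2,  t^3 - t^2}
// hermite_R0 = {2, 1, -2, 1} and hermite_R1 = {-3, -2, 3, -1} are their t^3 and
// t^2 coefficient rows; the hex literals above are just these constants packed
// two floats per 64-bit half for vcreate_f32.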
NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
    f128 tt = F128::Mult(t, t);
    f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    f128 catmull_R0 = vcombine_f32(vcreate_f32(0x40400000BF800000ULL),   // {-1.f, 3.f}
                                   vcreate_f32(0x3F800000C0400000ULL));  // {-3.f, 1.f}
    f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC0A0000040000000ULL),   // {2.f, -5.f}
                                   vcreate_f32(0xBF80000040800000ULL));  // {4.f, -1.f}
    f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF800000ULL),   // {-1.f, 0.f}
                                   vcreate_f32(0x000000003F800000ULL));  // {1.f, 0.f}
#else
    f128 catmull_R0 = F128::LoadA16(catmull_R0_);
    f128 catmull_R1 = F128::LoadA16(catmull_R1_);
    f128 catmull_R2 = F128::LoadA16(catmull_R2_);
#endif
    ttt = F128::Mult(ttt, catmull_R0);
    ttt = F128::MultAdd(tt, catmull_R1, ttt);
    ttt = F128::MultAdd(t, catmull_R2, ttt);
    ttt = F128::Add(ttt, F128::Set0100());
    // ... (the weights are combined with p0..p3)
}
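// catmull_R0 = {-1, 3, -3, 1}, catmull_R1 = {2, -5, 4, -1} and
// catmull_R2 = {-1, 0, 1, 0} are the t^3, t^2 and t coefficient rows of twice
// the Catmull-Rom basis; the remaining 0.5 factor is presumably folded into
// the elided combination with p0..p3.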
NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
                               f128arg_ex g) NLIB_NOEXCEPT {
    // p0 + f * (p1 - p0) + g * (p2 - p0)
    f128 p1p0 = F128::Sub(p1, p0);
    f128 p2p0 = F128::Sub(p2, p0);
    f128 tmp = F128::MultAdd(f, p1p0, p0);
    return F128::MultAdd(g, p2p0, tmp);
}
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
    ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
    ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
    ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = ~a.vec.u[0];
    ret.vec.u[1] = ~a.vec.u[1];
    ret.vec.u[2] = ~a.vec.u[2];
    ret.vec.u[3] = ~a.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    // CmpEq(a, a) is all-ones (for non-NaN), so andnot yields ~a.
    return _mm_andnot_ps(a, F128::CmpEq(a, a));
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
    return vreinterpretq_f32_u32(tmp);
#endif
}

// AndNot(a, b) returns ~a & b.
NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
    ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
    ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
    ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
    return vreinterpretq_f32_u32(tmp);
#endif
}

// OrNot(a, b) returns ~a | b.
NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
    ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
    ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
    ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
    return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
    ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vceqq_f32(a, b);
    return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
    f128 tmp = F128::AbsDiff(a, b);
    return F128::CmpLe(tmp, eps);
}

NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
    return F128::Min(max, F128::Max(min, value));
}
// InBound(value, bounds) tests |value| <= bounds per lane.
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
    uint32x4_t tmp = vcaleq_f32(value, bounds);
    return vreinterpretq_f32_u32(tmp);
#else
    return F128::CmpLe(F128::Abs(value), bounds);
#endif
}

// Compare-against-zero variants; AArch64 has dedicated instructions for these.
NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
    return F128::CmpEq(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
    return F128::CmpLt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vclezq_f32(value));
#else
    return F128::CmpLe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
    return F128::CmpGt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
    return F128::CmpGe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
    return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
    return F128::CmpNe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
    f128 tmp = F128::Abs(value);
    return F128::CmpLe(tmp, eps);
}
NLIB_M(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    return vdivq_f32(vdupq_n_f32(1.f), value);
#else
    // Reciprocal estimate plus two Newton-Raphson steps (see Div above).
    float32x4_t x;
    x = vrecpeq_f32(value);
    x = vmulq_f32(x, vrecpsq_f32(x, value));
    x = vmulq_f32(x, vrecpsq_f32(x, value));
    uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
    float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
    return result;
#endif
#else
    return F128::Div(F128::SetOne(), value);
#endif
}

NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
    return vrecpeq_f32(value);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
    return ret;
#endif
}
NLIB_M(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    f128 iszero = F128::CmpEqZero(value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::AndNot(iszero, result);
#else
    f128 zero = F128::SetZero();
    f128 iszero = F128::CmpEq(zero, value);
    f128 result = F128::Mult(value, F128::RecpSqrt(value));
    return F128::Select(iszero, zero, result);
#endif
}

NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = sqrtf(value.vec.v[0]);
    ret.vec.v[1] = sqrtf(value.vec.v[1]);
    ret.vec.v[2] = sqrtf(value.vec.v[2]);
    ret.vec.v[3] = sqrtf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
    // Estimate of sqrt(x) as 1 / rsqrt-estimate(x).
    return vrecpeq_f32(vrsqrteq_f32(value));
#else
    f128 ret;
    ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
    return ret;
#endif
}
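// On platforms without a vector sqrt, sqrt(x) is computed as x * (1/sqrt(x)).
// The x == 0 lanes would then be 0 * inf = NaN, so they are forced back to
// zero: AndNot with the iszero mask clears exactly those lanes, and the CAFE
// path achieves the same with an explicit Select.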
NLIB_M(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
    float32x4_t x;
    x = vrsqrteq_f32(value);
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
    f128 zeromask = F128::CmpEqZero(value);
    return F128::Select(zeromask, F128::SetInfinity(), x);
#else
    // Two Newton-Raphson steps per paired-single half: x <- x * (3 - v*x*x) / 2.
    f32x2 three = __PS_FDUP(3.f);
    f32x2 half = __PS_FDUP(0.5f);
    f128 ret;
    f32x2 v, x, xx;
    v = value.vec.ps[0];
    x = __PS_RSQRTE(v);
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[0] = __PS_MUL(half, xx);
    v = value.vec.ps[1];
    x = __PS_RSQRTE(v);
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    x = __PS_MUL(half, xx);
    xx = __PS_MUL(x, x);
    xx = __PS_NMSUB(v, xx, three);
    xx = __PS_MUL(x, xx);
    ret.vec.ps[1] = __PS_MUL(half, xx);
    f128 iszero = F128::CmpEq(F128::SetZero(), value);
    f128 inf = F128::SetInfinity();
    return F128::Select(iszero, inf, ret);
#endif
}
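// vrsqrtsq_f32(v, x*x) computes (3 - v*x*x)/2, so x * vrsqrtsq(...) is one
// Newton-Raphson step for 1/sqrt(v); the paired-single branch spells the same
// iteration out with __PS_NMSUB (3 - v*x*x) and an explicit multiply by 0.5.
// As with Recp, exact zeros are redirected to +infinity at the end.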
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
    ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
    ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
    ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
    return vrsqrteq_f32(value);
#else
    f128 ret;
    ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
    ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
    return ret;
#endif
}
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
    // Permute indices 4..7 select from the negated copy.
    const size_t lane0 = NegateLane0 ? 4 : 0;
    const size_t lane1 = NegateLane1 ? 5 : 1;
    const size_t lane2 = NegateLane2 ? 6 : 2;
    const size_t lane3 = NegateLane3 ? 7 : 3;
    return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}

template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
    return value;
}

template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
    return F128::Negate(value);
}
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#define NLIB_ISNAN(vec, idx) \
    ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif

NLIB_M(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(CAFE)
    // Paired-single masks are +/-1.0f rather than bit masks: the __PS_SEL
    // chain maps every ordinary number to 1.0f and NaN to -1.0f, because a
    // NaN condition fails both the v >= 0 and -v >= 0 selects.
    f128 ret;
    f32x2 one = __PS_FDUP(1.f);
    f32x2 minus_one = __PS_NEG(one);
    f32x2 v0 = value.vec.ps[0];
    f32x2 v1 = value.vec.ps[1];
    f32x2 t0 = __PS_SEL(v0, one, minus_one);
    f32x2 t1 = __PS_SEL(v1, one, minus_one);
    f32x2 v0neg = __PS_NEG(v0);
    f32x2 v1neg = __PS_NEG(v1);
    ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
    ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);  // was t0 in the source: a copy-paste slip
    return ret;
#else
    // NaN is the only value that compares unequal to itself.
    return F128::CmpNe(value, value);
#endif
}
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
    ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
    ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
    ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
    return ret;
#elif defined(CAFE)
    // FLT_MAX - |x| goes negative only when |x| is infinite (+/-1.0f mask style).
    f128 ret;
    f32x2 big_value = __PS_FDUP(FLT_MAX);
    ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
    ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
    return ret;
#else
    f128 inf_value = F128::SetInfinity();
    f128 abs_value = F128::Abs(value);
    return F128::CmpEq(inf_value, abs_value);
#endif
}
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
    return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
    return vrndaq_f32(value);
#else
    f128 sgn = F128::And(value, F128::SetSignMask());
    f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
    f128 result = F128::Sub(F128::Add(value, sm), sm);
    return result;
#endif
}
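// 0x4B000000 is the bit pattern of 2^23 = 8388608.0f. Adding 2^23 (carrying
// the operand's sign) leaves no mantissa bits for the fraction, so the
// add-then-subtract pair rounds to an integer in the current rounding mode:
// e.g. 2.4f + 8388608.0f rounds to 8388610.0f, and subtracting gives 2.0f.
// This only works for |value| < 2^23, which such code assumes.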
NLIB_M(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    for (size_t i = 0; i < 4; ++i) {
        if (NLIB_ISNAN(value.vec, i)) {
            ret.vec.u[i] = 0x7FC00000U;  // quiet NaN
        } else {
            // |x| >= 2^23 has no fractional bits; only smaller values are cast.
            ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                               ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                               : value.vec.v[i];
        }
    }
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    return F128::Select(cond, casted, value);
#else
    return vrndq_f32(value);
#endif
#endif
}
NLIB_M(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = floorf(value.vec.v[0]);
    ret.vec.v[1] = floorf(value.vec.v[1]);
    ret.vec.v[2] = floorf(value.vec.v[2]);
    ret.vec.v[3] = floorf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    // Where truncation rounded up (negative inputs), step down by one.
    f128 largeMask = F128::CmpGt(casted, value);
    casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
    return F128::Select(cond, casted, value);
#else
    return vrndmq_f32(value);
#endif
#endif
}
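// The fix-up works because a true comparison lane is 0xFFFFFFFF, which is -1
// as a signed integer; ConvertFromI128(CastToI128(mask)) therefore yields
// -1.0f in exactly the lanes where casted > value, and adding it completes
// floor(). Ceil below subtracts the same -1.0f where casted < value.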
NLIB_M(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = ceilf(value.vec.v[0]);
    ret.vec.v[1] = ceilf(value.vec.v[1]);
    ret.vec.v[2] = ceilf(value.vec.v[2]);
    ret.vec.v[3] = ceilf(value.vec.v[3]);
    return ret;
#elif defined(NLIB_SSE41)
    return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
    f128 x = F128::Abs(value);
    f128 c_2_23 = F128::SetValue(8388608.f, each_float);
    f128 cond = F128::CmpLt(x, c_2_23);
    f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
    // Where truncation rounded down (positive inputs), step up by one.
    f128 smallMask = F128::CmpLt(casted, value);
    casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
    return F128::Select(cond, casted, value);
#else
    return vrndpq_f32(value);
#endif
#endif
}
// Saturate clamps every lane to [0, 1].
NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    // ...
#else
    return F128::Clamp(value, F128::SetZero(), F128::SetOne());
#endif
}
NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
    static const float v_1_2pi = 0.15915494309189535f;  // 1 / (2*pi)
    static const float v_2pi = 6.283185307179586f;      // 2*pi
    // value - 2*pi * round(value / (2*pi)) maps each angle into [-pi, pi].
    const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
    f128 round = F128::Round(F128::Mult(value, recpTwoPi));
    const f128 twoPi = F128::SetValue(v_2pi, each_float);
    return F128::MultSub(twoPi, round, value);
}
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
    f128 x = F128::ModAngle(value);
    f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
    f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
    f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);
    // Reflect into [-pi/2, pi/2] using sin(pi - x) == sin(x). The source
    // tested Abs(value) here, but the reduced x is what must be compared.
    f128 xabs = F128::Abs(x);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, pi);
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, pidiv2);
    x = F128::Select(cond, x, pi_x);
    // Horner evaluation of the odd polynomial x - x^3 * P(x^2).
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(sin_coeff_);
    f128 result = F128::MultSub(xx, F128::SetValue<0>(coeff, each_select32),
                                F128::SetValue<1>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
    result = F128::Mult(xx, result);
    result = F128::MultSub(result, x, x);
    return result;
}
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
    f128 x = F128::ModAngle(value);
    f128 cvalue = F128::LoadA16(cos_cvalue_);
    // Reflect into [-pi/2, pi/2]; cos(pi - x) == -cos(x), so record a sign
    // flip for the reflected lanes.
    f128 xabs = F128::Abs(x);
    f128 xsign = F128::And(F128::SetSignMask(), x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
    x = F128::Select(cond, x, pi_x);
    f128 sign = F128::AndNot(cond, F128::SetSignMask());
    // Horner evaluation of the even polynomial 1 - x^2 * P(x^2).
    f128 xx = F128::Mult(x, x);
    f128 coeff = F128::LoadA16(cos_coeff_);
    f128 result = F128::MultSub(xx, F128::SetValue<0>(coeff, each_select32),
                                F128::SetValue<1>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
    result = F128::MultSub(xx, result, F128::SetOne());
    result = F128::Xor(sign, result);
    return result;
}
// Computes sin into ret.val[0] and cos into ret.val[1] with one shared
// range reduction.
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
    f128x2 ret;
    const f128 signmask = F128::SetSignMask();
    f128 x = F128::ModAngle(value);
    f128 cvalue = F128::LoadA16(cos_cvalue_);
    f128 xabs = F128::Abs(x);
    f128 xsign = F128::And(signmask, x);
    f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
    f128 pi_x = F128::Sub(mypi, x);
    f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
    x = F128::Select(cond, x, pi_x);
    f128 sign = F128::AndNot(cond, signmask);
    f128 xx = F128::Mult(x, x);
    {
        f128 coeff = F128::LoadA16(cos_coeff_);
        f128 result = F128::MultSub(xx, F128::SetValue<0>(coeff, each_select32),
                                    F128::SetValue<1>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
        result = F128::MultSub(xx, result, F128::SetOne());
        ret.val[1] = F128::Xor(sign, result);
    }
    {
        f128 coeff = F128::LoadA16(sin_coeff_);
        f128 result = F128::MultSub(xx, F128::SetValue<0>(coeff, each_select32),
                                    F128::SetValue<1>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
        result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
        result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
        result = F128::Mult(xx, result);
        ret.val[0] = F128::MultSub(result, x, x);
    }
    return ret;
}
f128 cmp, value_sign;
f128 one = F128::SetOne();
// ...
value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
cmp = F128::CmpLe(F128::Abs(value), one);
f128 x = F128::Select(cmp, value, F128::Recp(value));
// ...
f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
f128 xx = F128::Mult(x, x);
// ...
result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
result = F128::Mult(result, x);
result = F128::MultSub(xx, result, x);
f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
result = F128::Select(cmp, result, result_another);
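The reduction above is the classic |v| > 1 fold: the kernel polynomial is only accurate on [-1, 1], so larger magnitudes are evaluated on the reciprocal and folded back. A scalar model, with std::atan standing in for the polynomial kernel:

#include <cmath>

float atan_reduced(float v) {
    const float pi_2 = 1.5707963267948966f;
    bool in_range = std::fabs(v) <= 1.0f;
    float x = in_range ? v : 1.0f / v;
    float r = std::atan(x);  // kernel: accurate for |x| <= 1
    // atan(v) = sign(v)*pi/2 - atan(1/v) for |v| > 1
    return in_range ? r : std::copysign(pi_2, v) - r;
}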
NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
    // ...
    const f128 signmask = F128::SetSignMask();
    const f128 sy = F128::And(y, signmask);
    const f128 infx = F128::IsInfinite(x);
    const f128 infy = F128::IsInfinite(y);
    const f128 zerox = F128::CmpEqZero(x);
    const f128 zeroy = F128::CmpEqZero(y);
    const f128 posx = F128::CmpGtZero(x);
    // ...
    const f128 cval = F128::LoadA16(atan2_cvalue_);
    const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
    const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
    const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
    const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));
    // Special-case lattice; lanes left all-ones fall through to atan(y/x).
    f128 v = F128::Select(
        infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
        F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 mask;
    mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
    mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
    mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
    mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(CAFE)
    f128 mask;
    mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
    mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
    f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v), /* ... */));
#endif
    f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
    return F128::Select(mask, result, v);
}
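A scalar restatement of that lattice may help: the sign of y is folded into every constant, infinities and zeros map onto fixed multiples of pi, and only finite, non-degenerate inputs reach the atan(y/x) path with its x < 0 correction. This sketch ignores the signed-zero subtleties the bitwise lane code preserves:

#include <cmath>

float atan2_model(float y, float x) {
    const float pi = 3.14159265358979323846f;
    float s = std::signbit(y) ? -1.0f : 1.0f;
    if (std::isinf(y))
        return std::isinf(x) ? (x > 0 ? s * pi / 4 : s * 3 * pi / 4) : s * pi / 2;
    if (y == 0.0f) return x > 0 ? 0.0f : s * pi;
    if (x == 0.0f) return s * pi / 2;
    float r = std::atan(y / x);
    return x > 0 ? r : r + s * pi;  // quadrant correction for x < 0
}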
// Arcsine: atan2(value, sqrt(1 - value^2)).
f128 one = F128::SetOne();
f128 tmp = F128::MultSub(value, value, one);                      // 1 - value^2
f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));  // clamp negatives to 0
return F128::ArcTan2(value, argx);

// ...

// Arccosine: atan2(sqrt(1 - value^2), value).
f128 one = F128::SetOne();
f128 tmp = F128::MultSub(value, value, one);
f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
return F128::ArcTan2(argx, value);
#ifdef NLIB_F128_SIMD_NOUSE
    int ret = 0;
    ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
    ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
    ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
    ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
    return static_cast<uint8_t>(ret);
#elif defined(NLIB_SSE41)
    return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
    uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);  // {1, 2}
    uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);            // {4, 8}
    uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
    uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
#ifdef __aarch64__
    return vaddvq_u32(a);  // horizontal add across the four lanes
#else
    uint16x4_t tmp = vmovn_u32(a);
    tmp = vpadd_u16(tmp, tmp);
    tmp = vpadd_u16(tmp, tmp);
    return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
#else
    int tmp = (value.vec.u[0] >> 31);
    tmp |= (value.vec.u[1] >> 30) & 2;
    tmp |= (value.vec.u[2] >> 29) & 4;
    tmp |= (value.vec.u[3] >> 28) & 8;
    return static_cast<uint8_t>(tmp);
#endif
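The NEON arm relies on each comparison lane being all-ones or all-zero: ANDing lane i with 2^i and summing across lanes reassembles the same bitmask _mm_movemask_ps produces. Scalar model:

#include <cstdint>

uint8_t movemask4(const uint32_t lanes[4]) {
    uint32_t sum = 0;
    for (int i = 0; i < 4; ++i)
        sum += lanes[i] & (1u << i);  // an all-ones lane contributes exactly 2^i
    return static_cast<uint8_t>(sum);
}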
NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0 && value.vec.u[1] == 0 &&
           value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
    i128 casted = F128::CastToI128(value);
    return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
    return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
#endif
#else
    uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
    return (tmp & 0x80000000U) == 0;
#endif
}
NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
           value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
    i128 casted = F128::CastToI128(value);
    return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
    uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
    return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
    int32x4_t casted = vreinterpretq_s32_f32(value);
    int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
    return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
#endif
#else
    uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
    return (tmp & 0x80000000U) != 0;
#endif
}
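Hypothetical usage of the two predicates, branching once on a whole comparison mask instead of testing lanes individually:

f128 mask = F128::CmpLe(a, b);              // a, b: some f128 values
if (F128::IsAllMaskTrue(mask)) { /* every lane of a is <= b */ }
if (F128::IsAllMaskFalse(mask)) { /* no lane of a is <= b */ }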
template <size_t N>
NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.v[N];
#elif defined(NLIB_SSE41)
    float dest;
    _MM_EXTRACT_FLOAT(dest, value, N);
    return dest;
#elif defined(NLIB_NEON)
    return vgetq_lane_f32(value, N);
#else
    return value.vec.ps[N / 2][N % 2];
#endif
}
template <size_t N>
NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    return value.vec.u[N];
#elif defined(NLIB_SSE41)
    return static_cast<uint32_t>(_mm_extract_ps(value, N));
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vreinterpretq_u32_f32(value);
    return vgetq_lane_u32(tmp, N);
#else
    return value.vec.u[N];
#endif
}
NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    return value.vec.v[idx];
#elif defined(NLIB_SSE41)
    float dest;
    switch (idx) {
        case 0: _MM_EXTRACT_FLOAT(dest, value, 0); break;
        case 1: _MM_EXTRACT_FLOAT(dest, value, 1); break;
        case 2: _MM_EXTRACT_FLOAT(dest, value, 2); break;
        default: _MM_EXTRACT_FLOAT(dest, value, 3); break;
    }
    return dest;
#elif defined(NLIB_NEON)
    switch (idx) {
        case 0: return vgetq_lane_f32(value, 0);
        case 1: return vgetq_lane_f32(value, 1);
        case 2: return vgetq_lane_f32(value, 2);
        default: return vgetq_lane_f32(value, 3);
    }
#endif
}
NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    return value.vec.u[idx];
#elif defined(NLIB_SSE41)
    switch (idx) {
        case 0: return static_cast<uint32_t>(_mm_extract_ps(value, 0));
        case 1: return static_cast<uint32_t>(_mm_extract_ps(value, 1));
        case 2: return static_cast<uint32_t>(_mm_extract_ps(value, 2));
        default: return static_cast<uint32_t>(_mm_extract_ps(value, 3));
    }
#elif defined(NLIB_NEON)
    uint32x4_t tmp = vreinterpretq_u32_f32(value);
    switch (idx) {
        case 0: return vgetq_lane_u32(tmp, 0);
        case 1: return vgetq_lane_u32(tmp, 1);
        case 2: return vgetq_lane_u32(tmp, 2);
        default: return vgetq_lane_u32(tmp, 3);
    }
#endif
}
template <int N>
NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[N] = v;
    return ret;
#elif defined(NLIB_SSE41)
    f128 tmp = _mm_set_ss(v);
    return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
    // For a compile-time constant v, materializing a vector constant and
    // blending it in with Permute avoids moving v through a general register.
    return __builtin_constant_p(v) ?
        F128::Permute<N == 0 ? 4 : 0,
                      N == 1 ? 5 : 1,
                      N == 2 ? 6 : 2,
                      N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
        vsetq_lane_f32(v, value, N);
#else
    f128 ret = value;
    ret.vec.ps[N / 2][N % 2] = v;
    return ret;
#endif
}
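Hypothetical usage: overwrite a single lane, leaving the rest intact.

f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
v = F128::SetFloatToLane<2>(v, 9.f);  // v is now (1, 2, 9, 4)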
NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret = value;
    ret.vec.v[i] = v;
    return ret;
#elif defined(NLIB_SSE41)
    f128 tmp = _mm_set_ss(v);
    switch (i) {
        case 0: return _mm_insert_ps(value, tmp, 0x00);
        case 1: return _mm_insert_ps(value, tmp, 0x10);
        case 2: return _mm_insert_ps(value, tmp, 0x20);
        default: return _mm_insert_ps(value, tmp, 0x30);
    }
#elif defined(NLIB_NEON)
    switch (i) {
        case 0: return F128::SetFloatToLane<0>(value, v);
        case 1: return F128::SetFloatToLane<1>(value, v);
        case 2: return F128::SetFloatToLane<2>(value, v);
        default: return F128::SetFloatToLane<3>(value, v);
    }
#else
    f128 ret = value;
    switch (i) {
        case 0: ret.vec.ps[0][0] = v; break;
        case 1: ret.vec.ps[0][1] = v; break;
        case 2: ret.vec.ps[1][0] = v; break;
        default: ret.vec.ps[1][1] = v; break;
    }
    return ret;
#endif
}
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Merges one lane of a with one lane of b into a 64-bit pair; the template
// flags select the high or low lane of each source. The vtrn1/vtrn2 forms
// require AArch64; the .val[] forms are the ARMv7 fallback.
template <bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(a, b);
#else
    return vtrn_f32(a, b).val[0];
#endif
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(vrev64_f32(a), b);
#else
    return vtrn_f32(vrev64_f32(a), b).val[0];
#endif
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1_f32(a, vrev64_f32(b));
#else
    return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn2_f32(a, b);
#else
    return vtrn_f32(a, b).val[1];
#endif
}
// Returns the low (N == 0) or high (N == 1) 64-bit half of a vector.
template <int N>
float32x2_t F128SwizzleGet64(f128arg value) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<0>(f128arg value) NLIB_NOEXCEPT {
    return vget_low_f32(value);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128SwizzleGet64<1>(f128arg value) NLIB_NOEXCEPT {
    return vget_high_f32(value);
}

// Generic two-lane swizzle: fetch the half holding each lane, then merge.
template <int X0, int X1>
struct F128SwizzleHelper2 {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
        float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
        return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
    }
};

// Both output lanes read the same input lane: a single vdup.
template <int X>
struct F128SwizzleHelper2<X, X> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t x = F128SwizzleGet64<X / 2>(value);
        return vdup_lane_f32(x, (X & 1));
    }
};

template <>
struct F128SwizzleHelper2<0, 1> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT { return vget_low_f32(value); }
};
template <>
struct F128SwizzleHelper2<0, 2> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_low_f32(vuzp1q_f32(value, value));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(lo, hi).val[0];
#endif
    }
};
template <>
struct F128SwizzleHelper2<0, 3> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip1_f32(lo, hi);
#else
        return vzip_f32(lo, hi).val[0];
#endif
    }
};
template <>
struct F128SwizzleHelper2<1, 0> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vrev64_f32(vget_low_f32(value));
    }
};
template <>
struct F128SwizzleHelper2<1, 2> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip2_f32(lo, hi);
#else
        return vzip_f32(lo, hi).val[1];
#endif
    }
};
template <>
struct F128SwizzleHelper2<1, 3> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_low_f32(vuzp2q_f32(value, value));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(lo, hi).val[1];
#endif
    }
};
template <>
struct F128SwizzleHelper2<2, 0> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
        return vzip_f32(hi, lo).val[0];
#endif
    }
};
template <>
struct F128SwizzleHelper2<2, 1> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
        return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
        return vzip_f32(hi, lo).val[1];
#endif
    }
};
template <>
struct F128SwizzleHelper2<2, 3> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT { return vget_high_f32(value); }
};
template <>
struct F128SwizzleHelper2<3, 0> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
        return vzip1_f32(hi, lo);
#else
        return vzip_f32(hi, lo).val[0];
#endif
    }
};
template <>
struct F128SwizzleHelper2<3, 1> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t lo = vget_low_f32(value);
        float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
        return vzip2_f32(hi, lo);
#else
        return vzip_f32(hi, lo).val[1];
#endif
    }
};
template <>
struct F128SwizzleHelper2<3, 2> {
    static float32x2_t Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vrev64_f32(vget_high_f32(value));
    }
};

// Full four-lane swizzle: build each 64-bit half, then combine.
template <int V0, int V1, int V2, int V3>
struct F128SwizzleHelper {
    static NLIB_M(f128) Swizzle(f128arg value) NLIB_NOEXCEPT {
        return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
                            detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
    }
};

// Both halves share the same pattern: compute it once.
template <int Vx, int Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
    static NLIB_M(f128) Swizzle(f128arg value) NLIB_NOEXCEPT {
        float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
        return vcombine_f32(tmp, tmp);
    }
};

// Broadcast of a single lane.
template <int V>
struct F128SwizzleHelper<V, V, V, V> {
    // ...
};
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE paired singles: each 64-bit half is one __PS_MERGE of the two input
// halves v0 = {lane0, lane1} and v1 = {lane2, lane3}.
template <int X0, int X1>
struct F128SwizzleHelper {
    // ...
};
template <>
struct F128SwizzleHelper<0, 0> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v0); }
};
template <>
struct F128SwizzleHelper<0, 1> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v0; }
};
template <>
struct F128SwizzleHelper<0, 2> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v1); }
};
template <>
struct F128SwizzleHelper<0, 3> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v0, v1); }
};
template <>
struct F128SwizzleHelper<1, 0> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v0); }
};
template <>
struct F128SwizzleHelper<1, 1> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v0); }
};
template <>
struct F128SwizzleHelper<1, 2> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v1); }
};
template <>
struct F128SwizzleHelper<1, 3> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v1); }
};
template <>
struct F128SwizzleHelper<2, 0> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v0); }
};
template <>
struct F128SwizzleHelper<2, 1> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v1, v0); }
};
template <>
struct F128SwizzleHelper<2, 2> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v1); }
};
template <>
struct F128SwizzleHelper<2, 3> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v1; }
};
template <>
struct F128SwizzleHelper<3, 0> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v0); }
};
template <>
struct F128SwizzleHelper<3, 1> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v0); }
};
template <>
struct F128SwizzleHelper<3, 2> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v1); }
};
template <>
struct F128SwizzleHelper<3, 3> {
    static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v1); }
};
// Reorders the four lanes of value as <V0, V1, V2, V3>; -1 keeps a lane as-is.
template <int V0, int V1, int V2, int V3>
NLIB_M(f128) F128::Swizzle(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
    f128 ret;
    ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
    ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
    ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
    ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
    return ret;
#elif __has_builtin(__builtin_shufflevector)
    return __builtin_shufflevector(value, value, V0, V1, V2, V3);
#elif defined(NLIB_SSE41)
    return _mm_shuffle_ps(value, value,
                          _MM_SHUFFLE(V3 != -1 ? V3 : 3,
                                      V2 != -1 ? V2 : 2,
                                      V1 != -1 ? V1 : 1,
                                      V0 != -1 ? V0 : 0));
#elif defined(NLIB_NEON)
    return detail::F128SwizzleHelper<
        V0 != -1 ? V0 : 0,
        V1 != -1 ? V1 : 1,
        V2 != -1 ? V2 : 2,
        V3 != -1 ? V3 : 3>::Swizzle(value);
#else
    f128 ret;
    ret.vec.ps[0] = detail::F128SwizzleHelper<
        (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
    ret.vec.ps[1] = detail::F128SwizzleHelper<
        (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
    return ret;
#endif
}
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Lane patterns with a dedicated single-instruction form on NEON.
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vzip1q_f32(value, value);
#else
    return vzipq_f32(value, value).val[0];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn1q_f32(value, value);
#else
    return vtrnq_f32(value, value).val[0];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
    return value;  // identity
}
template <>
NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vuzp1q_f32(value, value);
#else
    return vuzpq_f32(value, value).val[0];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
    return vrev64q_f32(value);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vtrn2q_f32(value, value);
#else
    return vtrnq_f32(value, value).val[1];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 1);
    return vreinterpretq_f32_u32(rotated);
}
template <>
NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vuzp2q_f32(value, value);
#else
    return vuzpq_f32(value, value).val[1];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vzip2q_f32(value, value);
#else
    return vzipq_f32(value, value).val[1];
#endif
}
template <>
NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 2);
    return vreinterpretq_f32_u32(rotated);
}
template <>
NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
    uint32x4_t ival = vreinterpretq_u32_f32(value);
    uint32x4_t rotated = vextq_u32(ival, ival, 3);
    return vreinterpretq_f32_u32(rotated);
}
#endif
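Hypothetical usage of Swizzle; the specializations above make the common patterns single instructions on NEON:

f128 v = F128::SetValue(10.f, 20.f, 30.f, 40.f);
f128 rot = F128::Swizzle<3, 0, 1, 2>(v);  // (40, 10, 20, 30): rotate right
f128 rev = F128::Swizzle<1, 0, 3, 2>(v);  // (20, 10, 40, 30): swap within pairs
f128 bc  = F128::Swizzle<2, 2, 2, 2>(v);  // (30, 30, 30, 30): broadcast lane 2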
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Fallback: swizzle both sources into place, then blend lane-by-lane.
template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2 {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
        f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
        return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                                     ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
    }
};

// Every output lane keeps its own position: a single blend suffices.
template <bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
                                   ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
    }
};

// Low pair from one source, high pair from the other: a single shuffle.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
                              _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
    }
};

// Byte-wise rotations expressible as a single palignr.
template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
        return _mm_castsi128_ps(tmp);
    }
};
template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
        return _mm_castsi128_ps(tmp);
    }
};
template <>
struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        // bytes 4..19 of (a:b): {b1, b2, b3, a0}
        __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), 4);
        return _mm_castsi128_ps(tmp);
    }
};

// One lane of b inserted into a (or vice versa): a single insertps.
template <int V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
    }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
    }
};

// Chooses the cheapest SSE strategy for a full two-vector permute.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return F128PermuteHelper2<
            ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
            ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
            V0, V1, V2, V3>::Permute(a, b);
    }
};
#elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Returns the N-th 64-bit half of the (a, b) pair: 0-1 from a, 2-3 from b.
template <int N>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;

template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vget_low_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vget_high_f32(a);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vget_low_f32(b);
}
template <>
NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vget_high_f32(b);
}

template <int X0, int X1>
struct F128PermuteHelper2 {
    static float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
        float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
        return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
    }
};
template <int X>
struct F128PermuteHelper2<X, X> {
    static float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        float32x2_t x = F128PermuteGet64<X / 2>(a, b);
        return vdup_lane_f32(x, (X & 1));
    }
};

template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
                            F128PermuteHelper2<V2, V3>::Permute(a, b));
    }
};

// Cross-vector rotations expressible as a single vext.
template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
        return vreinterpretq_f32_s32(tmp);
    }
};
template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
        return vreinterpretq_f32_s32(tmp);
    }
};
template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
        return vreinterpretq_f32_s32(tmp);
    }
};
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE: each output half is one two-lane swizzle over a pair of input
// halves; VAR0/VAR1 pick which halves, R0/R1 pick the lanes within them.
template <int R0, int R1, int VAR0, int VAR1>
struct F128PermuteHelper2 {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
    }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
    static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
        return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
    }
};

template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        f128 ret;
        f32x2 x0 = a.vec.ps[0];
        f32x2 x1 = a.vec.ps[1];
        f32x2 x2 = b.vec.ps[0];
        f32x2 x3 = b.vec.ps[1];
        ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
            ::Permute(x0, x1, x2, x3);
        ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
            ::Permute(x0, x1, x2, x3);
        return ret;
    }
};
#else
// Generic fallback: extract each requested lane and rebuild the vector.
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                                  F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                                  F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                                  F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
        return ret;
    }
};
#endif

// All indices address a: the permute degenerates to a swizzle of a.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return F128::Swizzle<V0, V1, V2, V3>(a);
    }
};

// All indices address b.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
    }
};
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
// Interleave patterns with a dedicated unpack instruction.
template <>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpacklo_ps(a, b);
    }
};
template <>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpacklo_ps(b, a);
    }
};
template <>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpackhi_ps(a, b);
    }
};
template <>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return _mm_unpackhi_ps(b, a);
    }
};
#endif
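A compile-time model of the SSE dispatch above (names here are illustrative, not part of the header): a two-vector permute <V0,V1,V2,V3> collapses to one blend when every output lane keeps its own position modulo 4, and to one shuffle when the low pair comes entirely from one operand and the high pair from the other.

template <int V0, int V1, int V2, int V3>
struct PermuteKind {  // hypothetical mirror of the UseBlend/UseShuffle flags
    static const bool use_blend =
        (V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3);
    static const bool use_shuffle =
        (V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) ||
        (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4);
};
// PermuteKind<0, 5, 2, 7>::use_blend == true   -> one _mm_blend_ps
// PermuteKind<2, 3, 4, 5>::use_shuffle == true -> one _mm_shuffle_ps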
// Permute with "don't care" lanes: index 8 marks an output lane whose value
// is irrelevant. Each specialization rewrites the 8s into concrete indices
// chosen to pair with their 64-bit neighbors, then dispatches as usual.
template <int V0, int V1, int V2, int V3>
struct F128PermuteDontCareHelper {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
        static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
        return detail::F128PermuteHelper<arg1, arg2, V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V1, int V2, int V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V0, int V2, int V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V0, int V1, int V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V0, int V1, int V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V2, int V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = (V2 < 4) ? 0 : 4;
        return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
    }
};
template <int V1, int V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        static const int V3 = (V2 & 1) ? V2 : V2 + 1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V0, int V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V2 = (V1 < 4) ? 2 : 6;
        return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
    }
};
template <int V0, int V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V1 = (V0 & 1) ? V0 : V0 + 1;
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V0, int V2>
struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V1 = (V0 & 1) ? V0 : V0 + 1;
        static const int V3 = (V2 & 1) ? V2 : V2 + 1;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V1, int V3>
struct F128PermuteDontCareHelper<8, V1, 8, V3> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = (V1 & 1) ? V1 - 1 : V1;
        static const int V2 = (V3 & 1) ? V3 - 1 : V3;
        return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
    }
};
template <int V>
struct F128PermuteDontCareHelper<V, 8, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V1 = ((V & 3) == 0) ? V + 1 : V;
        static const int V2 = ((V & 3) == 0) ? V + 2 : V;
        static const int V3 = ((V & 3) == 0) ? V + 3 : V;
        return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
    }
};
template <int V>
struct F128PermuteDontCareHelper<8, V, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = ((V & 3) == 1) ? V - 1 : V;
        static const int V2 = ((V & 3) == 1) ? V + 1 : V;
        static const int V3 = ((V & 3) == 1) ? V + 2 : V;
        return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
    }
};
template <int V>
struct F128PermuteDontCareHelper<8, 8, V, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = ((V & 3) == 2) ? V - 2 : V;
        static const int V1 = ((V & 3) == 2) ? V - 1 : V;
        static const int V3 = ((V & 3) == 2) ? V + 2 : V;
        return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
    }
};
template <int V>
struct F128PermuteDontCareHelper<8, 8, 8, V> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        static const int V0 = ((V & 3) == 3) ? V - 3 : V;
        static const int V1 = ((V & 3) == 3) ? V - 2 : V;
        static const int V2 = ((V & 3) == 3) ? V - 1 : V;
        return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
    }
};
template <>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
    static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
        return a;  // every lane is don't-care, so any value will do
    }
};
// Builds a vector from lanes of a (indices 0-3) and b (4-7); 8 marks a
// don't-care lane.
template <int V0, int V1, int V2, int V3>
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
    return __builtin_shufflevector(a, b,
                                   (V0 != 8 ? V0 : -1),
                                   (V1 != 8 ? V1 : -1),
                                   (V2 != 8 ? V2 : -1),
                                   (V3 != 8 ? V3 : -1));
#else
    return detail::F128PermuteDontCareHelper<
        V0 != -1 ? V0 : 8,
        V1 != -1 ? V1 : 8,
        V2 != -1 ? V2 : 8,
        V3 != -1 ? V3 : 8>::Permute(a, b);
#endif
}
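Hypothetical usage: indices 0-3 take lanes of a, 4-7 take lanes of b, and 8 marks a lane whose content does not matter, which often unlocks a cheaper instruction:

f128 a = F128::SetValue(0.f, 1.f, 2.f, 3.f);
f128 b = F128::SetValue(4.f, 5.f, 6.f, 7.f);
f128 lo = F128::Permute<0, 4, 1, 5>(a, b);  // (0, 4, 1, 5): unpacklo pattern
f128 sh = F128::Permute<1, 2, 3, 4>(a, b);  // (1, 2, 3, 4): shifted window
f128 dc = F128::Permute<0, 1, 8, 8>(a, b);  // high lanes: don't care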
// Replaces the flagged lanes of value with the corresponding lanes of splat.
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
    // Pair adjacent replaced lanes so each 64-bit half of the permute can be
    // fetched from a single half of splat.
    const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
    const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
    const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
    const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
    const int v0 = SplatLane0 ? 4 : 0;
    const int v1 = SplatLane1 ? 5 : 1;
    const int v2 = SplatLane2 ? 6 : 2;
    const int v3 = SplatLane3 ? 7 : 3;
#endif
    return F128::Permute<v0, v1, v2, v3>(value, splat);
}
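On NEON the lane choice pairs neighbors so each half of the permute reads from a single half of splat; this is value-preserving only because a splat argument is presumably lane-uniform (lane 4 and lane 5 hold the same number). Hypothetical usage:

f128 r = F128::Splat<true, false, false, true>(value, splat);
// r = (splat lane, value lane 1, value lane 2, splat lane)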
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = powf(2.f, value.vec.v[0]);
    ret.vec.v[1] = powf(2.f, value.vec.v[1]);
    ret.vec.v[2] = powf(2.f, value.vec.v[2]);
    ret.vec.v[3] = powf(2.f, value.vec.v[3]);
    return ret;
#else
    // Split into integer and fractional parts: 2^value = 2^iround * 2^x.
    i128 iround = F128::ConvertToI128Round(value);
    f128 fround = F128::ConvertFromI128(iround);
    f128 x = F128::Sub(value, fround);
    f128 xx = F128::Mult(x, x);
    // Rational approximation of 2^x on the reduced range.
    f128 P = F128::LoadA16(F128::exp2_P_);
    f128 Q = F128::LoadA16(F128::exp2_Q_);
    // ... (px and qx seeded from P and Q; elided in the extraction)
    px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
    px = F128::Mult(x, px);
    // ...
    qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
    x = F128::Div(px, F128::Sub(qx, px));
    // ...
    // Reconstruct 2^iround by building its exponent bits directly.
    iround = I128::Add32(iround, I128::SetValue(127, each_int32));
    iround = I128::ShiftLeftLogical32(iround, 23);
    x = F128::Mult(x, F128::CastFromI128(iround));
    // ...
#endif
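The integer step above builds 2^n directly from IEEE-754 exponent bits: add the bias 127, shift into bits 23-30, and multiply. A scalar sketch (helper name illustrative):

#include <cstdint>
#include <cstring>

float ldexp_by_bits(float x, int n) {  // x * 2^n for moderate n
    uint32_t bits = static_cast<uint32_t>(n + 127) << 23;  // exponent-only float
    float p;
    std::memcpy(&p, &bits, sizeof(p));  // bit-cast the exponent field to 2^n
    return x * p;
}
// ldexp_by_bits(1.0f, 4) == 16.0f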
// e^value computed as 2^(value * log2(e)).
static const float log2e = 1.44269504088896340736f;
return Exp2(F128::Mult(log2e, value));

// ...

// Hyperbolic sine: e^x/2 - e^-x/2.
static const float log2e = 1.44269504088896340736f;
f128 negOne = F128::SetValue(-1.f, each_float);
f128 v0 = F128::MultAdd(log2e, value, negOne);  // value*log2(e) - 1
f128 v1 = F128::MultSub(log2e, value, negOne);  // -value*log2(e) - 1
f128 e0 = F128::Exp2(v0);                       // e^value / 2
f128 e1 = F128::Exp2(v1);                       // e^-value / 2
return F128::Sub(e0, e1);

// ...

// Hyperbolic cosine: e^x/2 + e^-x/2.
static const float log2e = 1.44269504088896340736f;
f128 negOne = F128::SetValue(-1.f, each_float);
f128 v0 = F128::MultAdd(log2e, value, negOne);
f128 v1 = F128::MultSub(log2e, value, negOne);
f128 e0 = F128::Exp2(v0);
f128 e1 = F128::Exp2(v1);
return F128::Add(e0, e1);
f128 cvalue = F128::LoadA16(tanh_cvalue_);
// ...
e = F128::MultAdd(half, e, half);
// ...
return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    f128 ret;
    ret.vec.v[0] = tanf(value.vec.v[0]);
    ret.vec.v[1] = tanf(value.vec.v[1]);
    ret.vec.v[2] = tanf(value.vec.v[2]);
    ret.vec.v[3] = tanf(value.vec.v[3]);
    return ret;
#else
    f128 C = F128::LoadA16(&F128::tan_c_[0]);
    // g counts quarter turns: g = round(value * 2/pi).
    f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
    // Even g keeps the kernel value; odd g takes the -1/tan branch below.
    f128 nearXAxis;
    i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
    i128 cmp = I128::CmpEq32(t0, I128::SetZero());
    nearXAxis = F128::CastFromI128(cmp);
    // ... (argument reduction producing f; elided in the extraction)
    f128 nearAxis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));
    f128 P = F128::LoadA16(&F128::tan_p_[0]);
    f128 Q = F128::LoadA16(&F128::tan_q_[0]);
    f128 ff = F128::Mult(f, f);
    // ...
    p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
    p = F128::MultAdd(p, ff, one);
    p = F128::Mult(f, p);
    // ...
    q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
    q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
    q = F128::MultAdd(q, ff, one);
    // Near a zero of tan, fall back to the leading terms to avoid 0/0.
    p = F128::Select(nearAxis, f, p);
    q = F128::Select(nearAxis, one, q);
    f128 r0 = F128::Div(p, q);
    f128 r1 = F128::Negate(F128::Recp(r0));  // tan shifted by pi/2 is -cot
    return F128::Select(nearXAxis, r0, r1);
#endif
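The octant logic in that path: g = round(x * 2/pi) counts quarter turns, the argument is reduced relative to g, and odd g flips the kernel through tan(f + pi/2) = -1/tan(f). A scalar model, with std::tan standing in for the p/q rational kernel and ignoring the split-precision reduction constants the real code uses:

#include <cmath>

float tan_model(float x) {
    float g = std::nearbyint(x * 0.63661977236758134f);  // x * 2/pi
    float f = x - g * 1.5707963267948966f;               // reduced to |f| <= pi/4
    float r = std::tan(f);                                // kernel value
    return (static_cast<long>(g) & 1) == 0 ? r : -1.0f / r;
}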
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
    static const float scale = 1.4426950408889634f;  // log2(e)
    f128 ret;
    ret.vec.v[0] = logf(value.vec.v[0]);
    ret.vec.v[1] = logf(value.vec.v[1]);
    ret.vec.v[2] = logf(value.vec.v[2]);
    ret.vec.v[3] = logf(value.vec.v[3]);
    return F128::Mult(scale, ret);  // log2(v) = ln(v) * log2(e)
#else
    // Split value into mantissa x in [1, 2) and unbiased exponent e.
    f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
    x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
    i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
    e = I128::ShiftRightLogical32(e, 23);
    e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
    x = F128::Sub(x, F128::SetOne());
    f128 z = F128::Mult(x, x);
    // Rational approximation of log(1 + x) on the reduced range.
    f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
    f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
    f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
    // ... (p seeded from pq0; elided in the extraction)
    p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
    p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
    f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
    q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
    f128 y = F128::Mult(z, p);
    y = F128::Div(y, q);
    y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
    // Combine: log2(v) = (x + y) * log2(e) + e, split for precision.
    // ... (log2ea, the low part of log2(e), is loaded here)
    result = F128::Mult(y, log2ea);
    result = F128::MultAdd(log2ea, x, result);
    result = F128::Add(result, y);
    result = F128::Add(result, x);
    result = F128::Add(result, F128::ConvertFromI128(e));
    // Patch the special cases: NaN, +inf, zero, and negative inputs.
    f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
    // ... (nan, inf, neg_inf, neg_nan extracted from nan_inf)
    f128 is_nan = F128::IsNaN(value);
    result = F128::Select(is_nan, nan, result);
    f128 is_inf = F128::IsInfinite(value);
    f128 is_pos = F128::CmpGtZero(value);
    f128 is_pos_inf = F128::And(is_inf, is_pos);
    result = F128::Select(is_pos_inf, inf, result);
    f128 is_zero = F128::CmpEqZero(value);
    result = F128::Select(is_zero, neg_inf, result);
    f128 is_neg = F128::CmpLtZero(value);
    result = F128::Select(is_neg, neg_nan, result);
    return result;
#endif
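The decomposition at the top of that path splits the input so that log2(v) = e + log2(m) with m in [1, 2). Scalar sketch of exactly those mask-and-shift lines:

#include <cstdint>
#include <cstring>

void split_log2(float v, float* m, int* e) {
    uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits));
    uint32_t mant = (bits & 0x807FFFFFu) | (127u << 23);  // force exponent to 0
    std::memcpy(m, &mant, sizeof(*m));                     // m in [1, 2)
    *e = static_cast<int>((bits & 0x7F800000u) >> 23) - 127;
}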
#ifdef NLIB_F128_SIMD_NOUSE
    f128 ret;
    ret.vec.v[0] = logf(value.vec.v[0]);
    ret.vec.v[1] = logf(value.vec.v[1]);
    ret.vec.v[2] = logf(value.vec.v[2]);
    ret.vec.v[3] = logf(value.vec.v[3]);
    return ret;
#else
    // ln(v) = log2(v) * ln(2).
    f128 x = F128::Log2(value);
    static const float recp_log2e = 0.6931471805597018f;
    return F128::Mult(recp_log2e, x);
#endif
#endif  // NLIB_DOXYGEN

// ...

#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
    SimdMatrix(float m00, float m01, float m02, float m03,
               float m10, float m11, float m12, float m13,
               float m20, float m21, float m22, float m23,
               float m30, float m31, float m32, float m33) NLIB_NOEXCEPT;
#endif

inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03,
                              float m10, float m11, float m12, float m13,
                              float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
    r[0] = F128::SetValue(m00, m01, m02, m03);
    r[1] = F128::SetValue(m10, m11, m12, m13);
    r[2] = F128::SetValue(m20, m21, m22, m23);
    r[3] = F128::SetValue(m30, m31, m32, m33);
}

inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
    // Choose the widest load the pointer's alignment allows.
    uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
    NLIB_ASSERT((algn & 3) == 0);
    switch (algn >> 2) {
        case 0:
            r[0] = F128::LoadA16(p);
            r[1] = F128::LoadA16(p + 4);
            r[2] = F128::LoadA16(p + 8);
            r[3] = F128::LoadA16(p + 12);
            break;
        case 1:
        case 3:
            r[0] = F128::LoadA4(p);
            r[1] = F128::LoadA4(p + 4);
            r[2] = F128::LoadA4(p + 8);
            r[3] = F128::LoadA4(p + 12);
            break;
        case 2:
            r[0] = F128::LoadA8(p);
            r[1] = F128::LoadA8(p + 4);
            r[2] = F128::LoadA8(p + 8);
            r[3] = F128::LoadA8(p + 12);
            break;
    }
}

#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// ...
#endif

#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
        f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
        f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
        f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
        row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
        row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
        row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
        row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
    }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
        float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
        uint64x2_t row0_, row1_, row2_, row3_; \
        row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                           vreinterpretq_u64_f32(trn_f1_.val[0])); \
        row0 = vreinterpretq_f32_u64(row0_); \
        row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                           vreinterpretq_u64_f32(trn_f1_.val[1])); \
        row1 = vreinterpretq_f32_u64(row1_); \
        row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
                           vreinterpretq_u64_f32(trn_f1_.val[0])); \
        row2 = vreinterpretq_f32_u64(row2_); \
        row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
                           vreinterpretq_u64_f32(trn_f1_.val[1])); \
        row3 = vreinterpretq_f32_u64(row3_); \
    }
#else
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
        float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
        row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
        row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
        row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
    }
#endif
#else
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
    { \
        f32x2 tmp0, tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
        row0.vec.ps[0] = tmp0; \
        row1.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
        row2.vec.ps[1] = tmp0; \
        row3.vec.ps[1] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = row2.vec.ps[0]; \
        row1.vec.ps[1] = row3.vec.ps[0]; \
        row2.vec.ps[0] = tmp0; \
        row3.vec.ps[0] = tmp1; \
        tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
        tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
        row0.vec.ps[1] = tmp0; \
        row1.vec.ps[1] = tmp1; \
    }
#endif

// ... (remaining declarations elided)

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
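As a closing illustration, hypothetical usage of NLIB_F128_TRANSPOSE: the four rows are rewritten in place, so after the call row0 holds the first column, and so on. Here m is assumed to be a 16-float, 16-byte-aligned array:

f128 r0 = F128::LoadA16(&m[0]);
f128 r1 = F128::LoadA16(&m[4]);
f128 r2 = F128::LoadA16(&m[8]);
f128 r3 = F128::LoadA16(&m[12]);
NLIB_F128_TRANSPOSE(r0, r1, r2, r3);  // rows become columns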