#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

#ifndef __USE_C99_MATH
#define __USE_C99_MATH
#endif

#define INFINITY ((float)(1e+300 * 1e+300))

#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
// scalar fallback definition of f128 (elided)
#elif defined(NLIB_SSE41)
// SSE4.1 definition of f128 (elided)
#elif defined(NLIB_NEON)
// ARM NEON definition of f128 (elided)
#endif

#if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// (f128arg typedef elided)
#endif

// Extra by-value vector arguments overflow the register file on some
// compilers, so they fall back to const references:
#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif
#if !defined(_MSC_VER) || _MSC_VER < 1800
// (compiler-specific declarations elided)
#endif

// F128: static __vectorcall functions over the four-float f128 vector.
// The class wrapper is elided in the excerpt and restored here.
class F128 {
 public:
  static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
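  // Example (sketch): a goes to lane 0 and d to lane 3.
  //   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);  // lanes {1, 2, 3, 4}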
  template <size_t N>
  static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;

  static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
  static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
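  // The A16/A8/A4 suffix is the caller's alignment guarantee for p, in
  // bytes: the A16 variants map to aligned instructions (_mm_load_ps /
  // _mm_store_ps on SSE), the A4 variants to unaligned ones. A sketch:
  //   alignas(16) float buf[4] = {1.f, 2.f, 3.f, 4.f};
  //   f128 v = F128::LoadA16(buf);   // aligned load
  //   float out[4];
  //   F128::StoreA4(out, v);         // out may be only 4-byte aligned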
  static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;

  static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
  static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
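  // StoreLo* writes lanes 0-1 and StoreHi* lanes 2-3, eight bytes each; a
  // sketch splitting one vector into two float pairs:
  //   float lo[2], hi[2];
  //   F128::StoreLoA4(lo, v);  // {v[0], v[1]}
  //   F128::StoreHiA4(hi, v);  // {v[2], v[3]}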
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
  // Fixed-point conversion is only available on the SIMD targets; the
  // template header is elided in the excerpt and assumed to be size_t N.
  template <size_t N>
  static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
  // (the remaining i128 conversion declarations are elided)
#endif

  static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
  template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
  static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;

  // MultAdd computes c + a * b and MultSub computes c - a * b. The tag
  // overloads select lane N of a; the tag parameter type is written
  // each_select32_tag here, reconstructed from the each_select32 call sites.
  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
                                   each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
                                   each_select32_tag) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
  static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
  // (Recp, Sqrt, Round, Floor, and Ceil declarations are elided; their
  //  definitions appear below)
  static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;

  static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;

  // Comparisons return all-ones lanes where the predicate holds.
  static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
  static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;
  static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
  static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
  // (Sin and Cos declarations are elided)
  static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;

  static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
  static f128 __vectorcall
  BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
  // Mask helpers: MoveMask packs the lane masks into the low four bits.
  static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
  static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
  static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
  static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;

  // Lane access (the template headers are elided in the excerpt):
  template <size_t N>
  static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
  template <size_t N>
  static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
  static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
  static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
  static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;

  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
  template <int V0, int V1, int V2, int V3>
  static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
  template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
  static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
  template <size_t N>
  static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
    const size_t NN = 4 - N;
    return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
  }
  template <size_t N>
  static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
    return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
  }
  // Shifts out the low N lanes of a, filling from b.
  template <size_t N>
  static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
  }

  // (static constant tables such as pi_values_, sin_coeff_, sin_cvalue_,
  //  cos_coeff_, cos_cvalue_, atan_coeff_, and atan2_cvalue_ are elided)
};
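// Index convention (sketch): Swizzle reorders the lanes of one vector;
// Permute picks lanes 0-3 from a and 4-7 from b. For v = {x, y, z, w}:
//   f128 r = F128::Swizzle<1, 2, 3, 0>(v);     // RotateLeft<1>: {y, z, w, x}
//   f128 p = F128::Permute<0, 1, 4, 5>(a, b);  // {a[0], a[1], b[0], b[1]}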
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// Broadcasts v to all four lanes (header reconstructed from the
// SetValue(v, each_float) call sites below):
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
  return ret;
#endif
}

// Broadcasts the raw bit pattern v to all four lanes:
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.u[0] = ret.vec.u[1] = ret.vec.u[2] = ret.vec.u[3] = v;
  return ret;
#elif defined(NLIB_SSE41)
  union {
    uint32_t u32;
    float f32;
  } tmp;
  tmp.u32 = v;
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#else  // CAFE
  union {
    uint32_t u32;
    float f32;
  } tmp;
  tmp.u32 = v;
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
  return ret;
#endif
}
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a;
  ret.vec.v[1] = b;
  ret.vec.v[2] = c;
  ret.vec.v[3] = d;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_set_ps(d, c, b, a);  // _mm_set_ps lists lanes high-to-low
#elif defined(NLIB_NEON)
  union {
    float f32[2];
    uint64_t u64;
  } tmp1, tmp2;  // (union packing reconstructed from the vcreate below)
  tmp1.f32[0] = a;
  tmp1.f32[1] = b;
  tmp2.f32[0] = c;
  tmp2.f32[1] = d;
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#else  // CAFE
  f128 ret;
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
  return ret;
#endif
}
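// The tag overloads broadcast a scalar or a raw bit pattern; a sketch:
//   f128 ones = F128::SetValue(1.f, each_float);            // {1, 1, 1, 1}
//   f128 signs = F128::SetValue(0x80000000U, each_uint32);  // sign bits only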
// Broadcasts lane N of value to all lanes (template header reconstructed):
template <size_t N>
NLIB_M(f128) F128::SetValue(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);  // the primary template serves N < 2
  return vdupq_lane_f32(tmp, N);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
  return ret;
#endif
}

#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Lanes 2 and 3 need the high half on NEON (specialization headers elided):
template <>
NLIB_M(f128) F128::SetValue<2>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
}
template <>
NLIB_M(f128) F128::SetValue<3>(f128arg value, each_select32_tag) NLIB_NOEXCEPT {
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
}
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE specializations (boilerplate elided); each merge duplicates one lane:
//   ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);  // lane 0
//   ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);  // lane 1
//   ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);  // lane 2
//   ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);  // lane 3
#endif
NLIB_M(f128) F128::SetZero() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
  return ret;
#endif
}

// Set1000 .. Set0001 return a one-hot unit vector; the NEON paths assemble
// {1.f, 0.f} halves directly from IEEE-754 bit patterns.
NLIB_M(f128) F128::Set1000() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return F128::SetValue(1.f, 0.f, 0.f, 0.f);  // (scalar body elided; equivalent)
#elif defined(NLIB_NEON)
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);  // {1.f, 0.f}
  float32x2_t x00 = vcreate_f32(0ULL);                   // {0.f, 0.f}
  return vcombine_f32(x10, x00);
#else
  return F128::LoadA16(F128::v1000_);
#endif
}

NLIB_M(f128) F128::Set0100() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return F128::SetValue(0.f, 1.f, 0.f, 0.f);  // (scalar body elided; equivalent)
#elif defined(NLIB_NEON)
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);  // {0.f, 1.f}
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x01, x00);
#else
  return F128::LoadA16(F128::v0100_);
#endif
}

NLIB_M(f128) F128::Set0010() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return F128::SetValue(0.f, 0.f, 1.f, 0.f);  // (scalar body elided; equivalent)
#elif defined(NLIB_NEON)
  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x10);
#else
  return F128::LoadA16(F128::v0010_);
#endif
}

NLIB_M(f128) F128::Set0001() NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return F128::SetValue(0.f, 0.f, 0.f, 1.f);  // (scalar body elided; equivalent)
#elif defined(NLIB_NEON)
  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
  float32x2_t x00 = vcreate_f32(0ULL);
  return vcombine_f32(x00, x01);
#else
  return F128::LoadA16(F128::v0001_);
#endif
}
// Clears lane N of value (template header elided in the excerpt):
template <size_t N>
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret = value;
  ret.vec.v[N] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_insert_ps(value, value, 1 << N);  // 1 << N sets the zero-mask bit
#elif defined(NLIB_NEON)
  return F128::Permute<N == 0 ? 4 : 0, N == 1 ? 5 : 1,
                       N == 2 ? 6 : 2, N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
#else  // CAFE
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = 0.f;
  return ret;
#endif
}

NLIB_M(f128) F128::SetOne() NLIB_NOEXCEPT {
  return F128::SetValue(1.f, each_float);
}
// The names of the next two setters are reconstructed from their values:
NLIB_M(f128) F128::SetNegativeOne() NLIB_NOEXCEPT {
  return F128::SetValue(-1.f, each_float);
}
NLIB_M(f128) F128::SetEpsilon() NLIB_NOEXCEPT {
  return F128::SetValue(1.0e-7f, each_float);
}
NLIB_M(f128) F128::SetSignMask() NLIB_NOEXCEPT {
  // -0.f is 0x80000000 in each lane: just the sign bit.
  return F128::SetValue(-0.f, each_float);
}
// (SetInfinity and the other constant setters are elided)
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = p[0];
  ret.vec.v[1] = p[1];
  ret.vec.v[2] = p[2];
  ret.vec.v[3] = p[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}

NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  return F128::LoadA16(p);  // (elided; alignment is irrelevant when scalar)
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
  return vld1q_f32(p);  // (elided in the excerpt; unaligned load assumed)
#else  // CAFE
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}

NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // Eight-byte alignment is enough for the 64-bit element load.
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  return F128::LoadA4(p);  // (non-NEON bodies elided; unaligned load assumed)
#endif
}

// Address-typed wrappers (headers elided in the excerpt):
NLIB_M(f128) F128::LoadA16(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(uintptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA16(intptr_t p) NLIB_NOEXCEPT {
  return LoadA16(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA8(intptr_t p) NLIB_NOEXCEPT {
  return LoadA8(reinterpret_cast<const float*>(p));
}
NLIB_M(f128) F128::LoadA4(intptr_t p) NLIB_NOEXCEPT {
  return LoadA4(reinterpret_cast<const float*>(p));
}
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else  // CAFE
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  StoreA16(p, value);  // (elided; alignment is irrelevant when scalar)
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
  vst1q_f32(p, value);  // (elided in the excerpt; unaligned store assumed)
#else  // CAFE
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  StoreA4(p, value);  // (non-NEON bodies elided; unaligned store assumed)
#endif
}

// Address-typed wrappers:
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
// Stores the low two lanes (value[0], value[1]):
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else  // CAFE
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}

NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);
  vst1_f32(p, tmp);  // (elided in the excerpt)
#else  // CAFE
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}

NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
// Stores the high two lanes (value[2], value[3]):
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  vst1_f32(p, vget_high_f32(value));
#else  // CAFE
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_high_f32(value);
  vst1_f32(p, tmp);  // (elided in the excerpt)
#else  // CAFE
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}

NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
// (the Abs header is elided in the excerpt)
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
  ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
  ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
  ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vabsq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_andnot_ps(signmask, value);  // clear each sign bit
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
  return ret;
#endif
}
// Per lane: mask ? a : b.
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 result;
  result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
  result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
  result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
  result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
  return result;
#elif defined(NLIB_SSE41)
  return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#else  // CAFE: __PS_SEL decides on the float sign, so the mask is squashed
  f128 mask_ = mask;
  mask_.vec.u[0] &= 0xFF7FFFFFUL;
  mask_.vec.u[1] &= 0xFF7FFFFFUL;
  mask_.vec.u[2] &= 0xFF7FFFFFUL;
  mask_.vec.u[3] &= 0xFF7FFFFFUL;
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
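// Sketch: Select turns per-lane branches into mask arithmetic; e.g. a
// hand-rolled lane-wise minimum:
//   f128 m = F128::Select(F128::CmpLt(a, b), a, b);  // min(a, b) per lane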
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// i128 <-> f128 conversions. i128 lanes travel as s8 on NEON, hence the
// vreinterpretq_*_s8 casts. (Function headers are elided in the excerpt.)
NLIB_M(f128) F128::ConvertFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
  return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}

NLIB_M(f128) F128::CastFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_f32_s8(value);
#endif
}

// Round-to-nearest conversion (the name ConvertToI128 is reconstructed):
NLIB_M(i128) F128::ConvertToI128(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
  // vcvtq truncates, so add +/-0.5 carrying the sign of each lane first.
  uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
  uint32x4_t sgn = vdupq_n_u32(0x80000000U);
  uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
  w = vorrq_u32(w, half);
  return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif
}

NLIB_M(i128) F128::ConvertToI128Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}

NLIB_M(i128) F128::CastToI128(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_f32(value);
#endif
}
// Converts fixed-point (N fraction bits) to float (template header
// reconstructed):
template <size_t N>
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
  // Scale by 2^-N: the multiplier is built directly from its IEEE-754 bits.
  f128 f = F128::ConvertFromI128(value);
  f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32);
  return F128::Mult(f, m);
#endif
}

// The inverse conversion (name and template header reconstructed):
template <size_t N>
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
  f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32);  // 2^N
  f128 f = F128::Mult(value, m);
  return F128::ConvertToI128Truncate(f);
#endif
}
#endif  // !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
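// Worked example (sketch): for N = 16 the fallback multiplier bits are
// (0x7F - 16) << 23 = 0x37800000, which is exactly 2^-16 as an IEEE-754
// float, so ConvertFromFixedPoint<16> maps the raw Q16.16 integer
// 0x00010000 to 1.0f; the (0x7F + N) << 23 pattern is the inverse scale 2^16.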
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcltq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcleq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgtq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgeq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
  return vaddq_f32(a, b);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
  return vsubq_f32(a, b);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// (the Negate header is elided in the excerpt)
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = -value.vec.v[0];
  ret.vec.v[1] = -value.vec.v[1];
  ret.vec.v[2] = -value.vec.v[2];
  ret.vec.v[3] = -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vnegq_f32(value);
#elif defined(NLIB_SSE41)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_xor_ps(signmask, value);  // flip each sign bit
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
  return vmulq_f32(a, b);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

// Scalar-times-vector overload (header elided in the excerpt):
NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
  return ret;
#else
  return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}

// Lane-N-times-vector overload (template header reconstructed):
template <size_t N>
NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vmulq_laneq_f32(b, a, N);
#else
  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
  return vmulq_n_f32(b, tmp);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  float t = a.vec.ps[N / 2][N % 2];
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
  return ret;
#else
  // (generic fallback elided)
#endif
}
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vdivq_f32(a, b);
#else
  // No divide instruction on 32-bit NEON: refine a reciprocal estimate with
  // two Newton-Raphson steps, forcing 1/0 to infinity.
  float32x4_t inv0 = vrecpeq_f32(b);
  float32x4_t step0 = vrecpsq_f32(inv0, b);
  float32x4_t inv1 = vmulq_f32(step0, inv0);
  float32x4_t step1 = vrecpsq_f32(inv1, b);
  float32x4_t inv2 = vmulq_f32(step1, inv1);
  uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
  inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
  return vmulq_f32(a, inv2);
#endif
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}
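// Newton-Raphson refresher for the 32-bit NEON path above (sketch):
// vrecpsq_f32(r, b) computes (2 - b * r), and r' = r * (2 - b * r) roughly
// doubles the number of correct bits per step, so the ~8-bit vrecpeq_f32
// estimate reaches single precision after the two steps:
//   r1 = r0 * (2 - b * r0);
//   r2 = r1 * (2 - b * r1);  // ~24 good bits, then a * r2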
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
  return vmaxq_f32(a, b);
#else  // CAFE: select on the sign of a - b
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
  return vminq_f32(a, b);
#else  // CAFE
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
// Pairwise ops produce {op(a0,a1), op(a2,a3), op(b0,b1), op(b2,b3)}.
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpmaxq_f32(a, b);
#else
  float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#else  // CAFE
  f128 ret;
  f32x2 v02, v13, cmp;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);  // {a0, a2}
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);  // {a1, a3}
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
  return ret;
#endif
}

NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpminq_f32(a, b);
#else
  float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#else  // CAFE
  f128 ret;
  f32x2 v02, v13, cmp;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
  return ret;
#endif
}

NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_f32(a, b);
#else
  float32x2_t al = vget_low_f32(a);
  float32x2_t ah = vget_high_f32(a);
  float32x2_t l = vpadd_f32(al, ah);
  float32x2_t bl = vget_low_f32(b);
  float32x2_t bh = vget_high_f32(b);
  float32x2_t h = vpadd_f32(bl, bh);
  return vcombine_f32(l, h);
#endif
#else  // CAFE
  f128 ret;
  f32x2 v02, v13;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  ret.vec.ps[0] = __PS_ADD(v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  ret.vec.ps[1] = __PS_ADD(v02, v13);
  return ret;
#endif
}
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vabdq_f32(a, b);
#else
  return F128::Abs(F128::Sub(a, b));
#endif
}
// MultAdd returns c + a * b (fused where the target supports it).
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_f32(c, a, b);
#else
  return vmlaq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Add(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_n_f32(c, b, a);
#else
  return vmlaq_n_f32(c, b, a);
#endif
#else
  return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}

// Lane-N form: c + a[N] * b (template header reconstructed):
template <size_t N>
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_laneq_f32(c, b, a, N);
#else
  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}

// MultSub returns c - a * b.
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Sub(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_n_f32(c, b, a);
#else
  return vmlsq_n_f32(c, b, a);
#endif
#else
  return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}

template <size_t N>
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_laneq_f32(c, b, a, N);
#else
  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
// Lerp(a, b, t) = a + t * (b - a), as a single MultAdd:
NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
  return F128::MultAdd(t, F128::Sub(b, a), a);
}
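// Worked example (sketch): with a = {0, 0, 0, 0}, b = {10, 20, 30, 40} and
// t broadcast to 0.25, Lerp returns a + t * (b - a) = {2.5, 5, 7.5, 10}.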
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
// Adds two angles and wraps the sum back into [-pi, pi).
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  // The lane layout of pi_values_ is assumed to be {pi, -pi, 2*pi, ...}:
  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);
  f128 sum = F128::Add(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
  f128 ofs = F128::And(cond, pi_dbl);  // +2*pi where sum < -pi
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
  ofs = F128::And(cond, pi_dbl);       // -2*pi where sum >= pi
  return F128::Sub(result, ofs);
}
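// Worked example (sketch): AddAngle on lanes holding 2.5 and 1.5 sums to
// 4.0, which is >= pi, so 2*pi is subtracted: the result lane is about
// -2.2832, back inside [-pi, pi).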
// Subtracts two angles with the same wrapping into [-pi, pi).
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);  // (layout assumed)
  f128 sum = F128::Sub(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
  f128 ofs = F128::And(cond, pi_dbl);
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
  ofs = F128::And(cond, pi_dbl);
  return F128::Sub(result, ofs);
}
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // {2, 1, -2, 1} and {-3, -2, 3, -1} encoded as IEEE-754 bit patterns:
  f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),
                                 vcreate_f32(0x3F800000C0000000ULL));
  f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),
                                 vcreate_f32(0xBF80000040400000ULL));
#else
  f128 hermite_R0 = F128::LoadA16(hermite_R0_);
  f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif
  // Build the basis weights {h00, h10, h01, h11} in ttt:
  ttt = F128::Mult(ttt, hermite_R0);
  ttt = F128::MultAdd(tt, hermite_R1, ttt);
  ttt = F128::MultAdd(t, F128::Set0100(), ttt);
  ttt = F128::Add(ttt, F128::Set1000());
  // (the final weighted sum is elided in the excerpt; reconstructed here)
  f128 result = F128::Mult(F128::SetValue<0>(ttt, each_select32), p0);
  result = F128::MultAdd(F128::SetValue<1>(ttt, each_select32), v0, result);
  result = F128::MultAdd(F128::SetValue<2>(ttt, each_select32), p1, result);
  return F128::MultAdd(F128::SetValue<3>(ttt, each_select32), v1, result);
}

NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // {-1, 3, -3, 1}, {2, -5, 4, -1}, and {-1, 0, 1, 0} as bit patterns:
  f128 catmull_R0 = vcombine_f32(vcreate_f32(0x40400000BF800000ULL),
                                 vcreate_f32(0x3F800000C0400000ULL));
  f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC0A0000040000000ULL),
                                 vcreate_f32(0xBF80000040800000ULL));
  f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF800000ULL),
                                 vcreate_f32(0x000000003F800000ULL));
#else
  f128 catmull_R0 = F128::LoadA16(catmull_R0_);
  f128 catmull_R1 = F128::LoadA16(catmull_R1_);
  f128 catmull_R2 = F128::LoadA16(catmull_R2_);
#endif
  ttt = F128::Mult(ttt, catmull_R0);
  ttt = F128::MultAdd(tt, catmull_R1, ttt);
  ttt = F128::MultAdd(t, catmull_R2, ttt);
  ttt = F128::Add(ttt, F128::Set0100());
  // (the remaining combination of p0..p3 with these weights is elided)
}

// p0 + f * (p1 - p0) + g * (p2 - p0):
NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
                               f128arg_ex g) NLIB_NOEXCEPT {
  f128 p1p0 = F128::Sub(p1, p0);
  f128 p2p0 = F128::Sub(p2, p0);
  f128 tmp = F128::MultAdd(f, p1p0, p0);
  return F128::MultAdd(g, p2p0, tmp);
}
// (the Or header is elided in the excerpt)
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}

// (the Not header is elided in the excerpt)
NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_andnot_ps(a, F128::CmpEq(a, a));  // CmpEq(a, a) supplies all-ones
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}

// AndNot returns ~a & b:
NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}

// OrNot returns ~a | b:
NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vceqq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}
// True where |a - b| <= eps, lane-wise:
NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
  f128 tmp = F128::AbsDiff(a, b);
  return F128::CmpLe(tmp, eps);
}
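// Sketch: approximate equality against a broadcast tolerance.
//   f128 eps = F128::SetValue(1.0e-5f, each_float);
//   f128 nearly = F128::CmpNearEq(a, b, eps);  // |a - b| <= eps, per lane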
NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
  return F128::Min(max, F128::Max(min, value));
}

// True where |value| <= bounds, lane-wise:
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint32x4_t tmp = vcaleq_f32(value, bounds);
  return vreinterpretq_f32_u32(tmp);
#else
  return F128::CmpLe(F128::Abs(value), bounds);
#endif
}

// Zero comparisons use the dedicated A64 instructions where available
// (function headers elided in the excerpt):
NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
  return F128::CmpEq(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
  return F128::CmpLt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vclezq_f32(value));
#else
  return F128::CmpLe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
  return F128::CmpGt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
  return F128::CmpGe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
  return F128::CmpNe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
  f128 tmp = F128::Abs(value);
  return F128::CmpLe(tmp, eps);
}
// (the Recp header is elided in the excerpt)
NLIB_M(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vdivq_f32(vdupq_n_f32(1.f), value);
#else
  // Two Newton-Raphson steps on the reciprocal estimate; 1/0 forced to inf:
  float32x4_t x;
  x = vrecpeq_f32(value);
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
  float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
  return result;
#endif
#else  // CAFE
  return F128::Div(F128::SetOne(), value);
#endif
}

// Fast, low-precision estimate of the reciprocal:
NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
  return vrecpeq_f32(value);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
  return ret;
#endif
}
// (the Sqrt header is elided in the excerpt)
NLIB_M(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  // sqrt(x) = x * rsqrt(x); RecpSqrt(0) is inf, so zero lanes are forced
  // back to zero instead of 0 * inf = NaN.
  f128 iszero = F128::CmpEqZero(value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::AndNot(iszero, result);
#else  // CAFE
  f128 zero = F128::SetZero();
  f128 iszero = F128::CmpEq(zero, value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::Select(iszero, zero, result);
#endif
}

NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  return vrecpeq_f32(vrsqrteq_f32(value));  // sqrt(x) ~ 1 / rsqrt(x)
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
  return ret;
#endif
}
NLIB_M(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
  // Two vrsqrtsq_f32 Newton-Raphson steps on the initial estimate:
  float32x4_t x;
  x = vrsqrteq_f32(value);
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  f128 zeromask = F128::CmpEqZero(value);
  return F128::Select(zeromask, F128::SetInfinity(), x);
#else  // CAFE
  // x' = 0.5 * x * (3 - v * x * x), applied twice per paired-single half:
  f32x2 three = __PS_FDUP(3.f);
  f32x2 half = __PS_FDUP(0.5f);
  f128 ret;
  f32x2 v, x, xx;
  v = value.vec.ps[0];
  x = __PS_RSQRTE(v);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);  // 3 - v * x * x
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[0] = __PS_MUL(half, xx);
  v = value.vec.ps[1];
  x = __PS_RSQRTE(v);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[1] = __PS_MUL(half, xx);
  f128 iszero = F128::CmpEq(F128::SetZero(), value);
  f128 inf = F128::SetInfinity();
  return F128::Select(iszero, inf, ret);
#endif
}

NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
  return vrsqrteq_f32(value);
#else  // CAFE
  f128 ret;
  ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
  return ret;
#endif
}
// Negates the selected lanes: Permute draws lane i from value (indices 0-3)
// or from Negate(value) (indices 4-7).
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {
  const size_t lane0 = NegateLane0 ? 4 : 0;
  const size_t lane1 = NegateLane1 ? 5 : 1;
  const size_t lane2 = NegateLane2 ? 6 : 2;
  const size_t lane3 = NegateLane3 ? 7 : 3;
  return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}

// Trivial specializations (headers partially elided in the excerpt):
template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
  return value;
}
template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
  return F128::Negate(value);
}
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#define NLIB_ISNAN(vec, idx) \
  ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif

// (the IsNaN header is elided in the excerpt)
NLIB_M(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
  return ret;
#elif defined(CAFE)
  // Sign-encoded result: a lane ends up -1 only when both selects fall
  // through, which happens only for NaN (it fails every >= 0 test).
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  f32x2 v0 = value.vec.ps[0];
  f32x2 v1 = value.vec.ps[1];
  f32x2 t0 = __PS_SEL(v0, one, minus_one);
  f32x2 t1 = __PS_SEL(v1, one, minus_one);
  f128 ret;
  f32x2 v0neg = __PS_NEG(v0);
  f32x2 v1neg = __PS_NEG(v1);
  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
  ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);  // the excerpt reads t0; t1 is intended
  return ret;
#else
  // NaN is the only value that compares unequal to itself.
  return F128::CmpNe(value, value);
#endif
}

NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
  return ret;
#elif defined(CAFE)
  // Sign-encoded: FLT_MAX - |v| goes negative only beyond the finite range.
  f128 ret;
  f32x2 big_value = __PS_FDUP(FLT_MAX);
  ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
  return ret;
#else
  f128 inf_value = F128::SetInfinity();
  f128 abs_value = F128::Abs(value);
  return F128::CmpEq(inf_value, abs_value);
#endif
}
// (the Round header is elided in the excerpt)
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
  return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
  return vrndaq_f32(value);
#else
  // Adding then subtracting 2^23 (0x4B000000), carrying the input's sign,
  // rounds away the fraction bits.
  f128 sgn = F128::And(value, F128::SetSignMask());
  f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
  f128 result = F128::Sub(F128::Add(value, sm), sm);
  return result;  // (the guard for lanes already >= 2^23 is elided)
#endif
}

NLIB_M(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  for (size_t i = 0; i < 4; ++i) {
    if (NLIB_ISNAN(value.vec, i)) {
      ret.vec.u[i] = 0x7FC00000U;  // canonical quiet NaN
    } else {
      // Only values below 2^23 carry fraction bits to drop.
      ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                         ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                         : value.vec.v[i];
    }
  }
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  // Values >= 2^23 are already integral; converting them could overflow.
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  return F128::Select(cond, casted, value);
#else
  return vrndq_f32(value);
#endif
#endif
}

NLIB_M(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = floorf(value.vec.v[0]);
  ret.vec.v[1] = floorf(value.vec.v[1]);
  ret.vec.v[2] = floorf(value.vec.v[2]);
  ret.vec.v[3] = floorf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  // Where truncation rounded up (negative inputs), casted is one too large;
  // the mask reinterpreted as an integer is -1 exactly there.
  f128 largeMask = F128::CmpGt(casted, value);
  casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(largeMask)));
  return F128::Select(cond, casted, value);
#else
  return vrndmq_f32(value);
#endif
#endif
}

NLIB_M(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = ceilf(value.vec.v[0]);
  ret.vec.v[1] = ceilf(value.vec.v[1]);
  ret.vec.v[2] = ceilf(value.vec.v[2]);
  ret.vec.v[3] = ceilf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  // Subtracting the -1 mask adds one where truncation rounded down.
  f128 smallMask = F128::CmpLt(casted, value);
  casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(smallMask)));
  return F128::Select(cond, casted, value);
#else
  return vrndpq_f32(value);
#endif
#endif
}
NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // (scalar clamp to [0, 1] elided)
#else
  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
#endif
}

// Wraps value into [-pi, pi): value - 2*pi * Round(value / (2*pi)).
NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {
  static const float v_1_2pi = 0.15915494309189535f;  // 1 / (2*pi)
  static const float v_2pi = 6.283185307179586f;      // 2*pi
  const f128 recpTwoPi = F128::SetValue(v_1_2pi, each_float);
  f128 round = F128::Round(F128::Mult(value, recpTwoPi));
  const f128 twoPi = F128::SetValue(v_2pi, each_float);
  return F128::MultSub(twoPi, round, value);  // value - twoPi * round
}
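// Worked example (sketch): ModAngle(7.0f): 7 / (2*pi) is about 1.11, Round
// gives 1, and 7 - 2*pi * 1 is about 0.7168, the wrapped angle.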
// (the Sin header is elided in the excerpt)
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {
  f128 x = F128::ModAngle(value);
  f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
  // The lane layout of sin_cvalue_ is assumed to be {pi, pi/2, c2, c3}:
  f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);
  f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);
  // Fold into [-pi/2, pi/2] with sin(x) = sin(pi - x):
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, pi);
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, pidiv2);
  x = F128::Select(cond, x, pi_x);
  // Odd polynomial in x, evaluated by Horner's method on x^2:
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(sin_coeff_);
  f128 result = F128::SetValue<0>(coeff, each_select32);  // (seed reconstructed)
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
  result = F128::Mult(xx, result);
  result = F128::MultSub(result, x, x);  // x - x * result
  return result;
}
// (the Cos header is elided in the excerpt)
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {
  f128 x = F128::ModAngle(value);
  f128 cvalue = F128::LoadA16(cos_cvalue_);  // layout: {pi, pi/2, c2, c3}
  // Fold into [-pi/2, pi/2] with cos(x) = -cos(pi - x):
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  // Sign to restore on the folded lanes:
  f128 sign = F128::AndNot(cond, F128::SetSignMask());
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(cos_coeff_);
  f128 result = F128::SetValue<0>(coeff, each_select32);  // (seed reconstructed)
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetOne());  // 1 - xx * result
  result = F128::Xor(sign, result);
  return result;
}
// Computes sine (val[0]) and cosine (val[1]) of each lane in one pass.
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {
  f128x2 ret;
  const f128 signmask = F128::SetSignMask();
  f128 x = F128::ModAngle(value);
  f128 cvalue = F128::LoadA16(cos_cvalue_);
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(signmask, x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  // Only the cosine changes sign under the pi - x fold:
  f128 sign = F128::AndNot(cond, signmask);
  f128 xx = F128::Mult(x, x);

  // Cosine polynomial:
  f128 coeff = F128::LoadA16(cos_coeff_);
  f128 result = F128::SetValue<0>(coeff, each_select32);  // (seed reconstructed)
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetOne());
  ret.val[1] = F128::Xor(sign, result);

  // Sine polynomial (sin(pi - x) = sin(x), so no sign fix is needed):
  coeff = F128::LoadA16(sin_coeff_);
  result = F128::SetValue<0>(coeff, each_select32);  // (seed reconstructed)
  result = F128::MultSub(xx, result, F128::SetValue<1>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
  result = F128::Mult(xx, result);
  ret.val[0] = F128::MultSub(result, x, x);
  return ret;
}
2427 f128 cmp, value_sign;
2429 f128 one = F128::SetOne();
2434 value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
2435 cmp = F128::CmpLe(F128::Abs(value), one);
2437 f128 x = F128::Select(cmp, value, F128::Recp(value));
2444 f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
2445 f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
2446 f128 xx = F128::Mult(x, x);
2449 result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
2450 result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
2451 result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
2452 result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
2453 result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
2454 result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
2456 result = F128::Mult(result, x);
2457 result = F128::MultSub(xx, result, x);
2459 f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
2460 f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
2461 result = F128::Select(cmp, result, result_another);
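// The Select above applies the reciprocal-argument identity: for |v| > 1,
// atan(v) = sign(v) * pi/2 - atan(1/v), so the polynomial only ever sees
// arguments in [-1, 1] (e.g. atan(2) = pi/2 - atan(0.5)).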
2465 NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
2482 const f128 signmask = F128::SetSignMask();
2484 const f128 sy = F128::And(y, signmask);
2485 const f128 infx = F128::IsInfinite(x);
2486 const f128 infy = F128::IsInfinite(y);
2487 const f128 zerox = F128::CmpEqZero(x);
2488 const f128 zeroy = F128::CmpEqZero(y);
2489 const f128 posx = F128::CmpGtZero(x);
2499 const f128 cval = F128::LoadA16(atan2_cvalue_);
2500 const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
2501 const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
2502 const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
2503 const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));
2505 f128 v = F128::Select(
2506 infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
2507 F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));
2512 #if defined(NLIB_F128_SIMD_NOUSE)
2514 mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
2515 mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
2516 mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
2517 mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
2521 mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
2522 mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
2523 mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
2524 mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
2526 f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v),
2529 f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
2530 return F128::Select(mask, result, v);
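// Special cases resolved by the Select chain above (sy carries the sign of y):
//   y = +/-inf, x = +inf -> +/-pi/4      y = +/-inf, x = -inf -> +/-3pi/4
//   y = +/-inf, x finite -> +/-pi/2      y = 0, x > 0         -> 0
//   y = 0, x <= 0        -> +/-pi        x = 0, y != 0        -> +/-pi/2
// Every other lane receives the all-ones sentinel, which the mask comparison
// then routes to the generic atan(y/x) result, with +/-pi added when x <= 0.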
2535 f128 one = F128::SetOne();
2536 f128 tmp = F128::MultSub(value, value, one);
2537 f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2538 return F128::ArcTan2(value, argx);
2543 f128 one = F128::SetOne();
2544 f128 tmp = F128::MultSub(value, value, one);
2545 f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2546 return F128::ArcTan2(argx, value);
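// Both functions above reduce to ArcTan2 with sqrt(1 - v^2), where the AndNot
// clamps negative round-off in 1 - v*v to zero before the square root:
//   ArcSin(v) = atan2(v, sqrt(1 - v^2))   e.g. ArcSin(1) = atan2(1, 0) = pi/2
//   ArcCos(v) = atan2(sqrt(1 - v^2), v)   e.g. ArcCos(0) = atan2(1, 0) = pi/2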
2551 #ifdef NLIB_F128_SIMD_NOUSE
2553 ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
2554 ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
2555 ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
2556 ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
2558 #elif defined(NLIB_SSE41)
2559 return static_cast<uint8_t>(_mm_movemask_ps(value));
2560 #elif defined(NLIB_NEON)
2561 uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
2562 uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
2563 uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
2564 uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
2566 return vaddvq_u32(a);
2568 uint16x4_t tmp = vmovn_u32(a);
2569 tmp = vpadd_u16(tmp, tmp);
2570 tmp = vpadd_u16(tmp, tmp);
2571 return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
2574 int tmp = (value.vec.u[0] >> 31);
2575 tmp |= (value.vec.u[1] >> 30) & 2;
2576 tmp |= (value.vec.u[2] >> 29) & 4;
2577 tmp |= (value.vec.u[3] >> 28) & 8;
2583 NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT {
2584 #ifdef NLIB_F128_SIMD_NOUSE
2585 return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
2586 #elif defined(NLIB_SSE41)
2587 i128 casted = F128::CastToI128(value);
2588 return _mm_testz_si128(casted, casted) != 0;
2589 #elif defined(NLIB_NEON)
2591 uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
2592 return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2594 int32x4_t casted = vreinterpretq_s32_f32(value);
2595 int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
2596 return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
2599 uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
2600 return (tmp & 0x80000000U) == 0;
2605 NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT {
2606 #ifdef NLIB_F128_SIMD_NOUSE
2607 return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
2608        value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
2609 #elif defined(NLIB_SSE41)
2610 i128 casted = F128::CastToI128(value);
2611 return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
2612 #elif defined(NLIB_NEON)
2614 uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
2615 return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2617 int32x4_t casted = vreinterpretq_s32_f32(value);
2618 int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
2619 return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
2622 uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
2623 return (tmp & 0x80000000U) != 0;
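// Usage sketch (illustrative): comparison results are all-ones/all-zero lane
// masks, so whole-vector tests reduce to these two predicates.
//
//   f128 v = F128::SetValue(0.5f, -2.f, 0.25f, 1.f);
//   f128 in_range = F128::CmpLe(F128::Abs(v), F128::SetValue(1.f, each_float));
//   if (F128::IsAllMaskTrue(in_range)) { /* every lane in [-1, 1] */ }
//   else if (F128::IsAllMaskFalse(in_range)) { /* no lane in [-1, 1] */ }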
2629 NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT {
2631 #ifdef NLIB_F128_SIMD_NOUSE
2632 return value.vec.v[N];
2633 #elif defined(NLIB_SSE41)
2635 _MM_EXTRACT_FLOAT(dest, value, N);
2637 #elif defined(NLIB_NEON)
2638 return vgetq_lane_f32(value, N);
2640 return value.vec.ps[N / 2][N % 2];
2646 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT {
2648 #ifdef NLIB_F128_SIMD_NOUSE
2649 return value.vec.u[N];
2650 #elif defined(NLIB_SSE41)
2651 return _mm_extract_ps(value, N);
2652 #elif defined(NLIB_NEON)
2653 uint32x4_t tmp = vreinterpretq_u32_f32(value);
2654 return vgetq_lane_u32(tmp, N);
2656 return value.vec.u[N];
2661 NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2662 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2663 return value.vec.v[idx];
2664 #elif defined(NLIB_SSE41)
2668 _MM_EXTRACT_FLOAT(dest, value, 0);
2671 _MM_EXTRACT_FLOAT(dest, value, 1);
2674 _MM_EXTRACT_FLOAT(dest, value, 2);
2677 _MM_EXTRACT_FLOAT(dest, value, 3);
2684 #elif defined(NLIB_NEON)
2687 return vgetq_lane_f32(value, 0);
2689 return vgetq_lane_f32(value, 1);
2691 return vgetq_lane_f32(value, 2);
2693 return vgetq_lane_f32(value, 3);
2702 NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2703 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2704 return value.vec.u[idx];
2705 #elif defined(NLIB_SSE41)
2708 return static_cast<uint32_t>(_mm_extract_ps(value, 0));
2710 return static_cast<uint32_t>(_mm_extract_ps(value, 1));
2712 return static_cast<uint32_t>(_mm_extract_ps(value, 2));
2714 return static_cast<uint32_t>(_mm_extract_ps(value, 3));
2719 #elif defined(NLIB_NEON)
2720 uint32x4_t tmp = vreinterpretq_u32_f32(value);
2723 return vgetq_lane_u32(tmp, 0);
2725 return vgetq_lane_u32(tmp, 1);
2727 return vgetq_lane_u32(tmp, 2);
2729 return vgetq_lane_u32(tmp, 3);
2739 NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT {
2741 #ifdef NLIB_F128_SIMD_NOUSE
2745 #elif defined(NLIB_SSE41)
2746 f128 tmp = _mm_set_ss(v);
2747 return _mm_insert_ps(value, tmp, N << 4);
2748 #elif defined(NLIB_NEON)
2749 return __builtin_constant_p(v) ?
2750 F128::Permute<N == 0 ? 4 : 0,
2753 N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
2754 vsetq_lane_f32(v, value, N);
2757 ret.vec.ps[N / 2][N % 2] = v;
2763 NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
2764 #ifdef NLIB_F128_SIMD_NOUSE
2768 #elif defined(NLIB_SSE41)
2769 f128 tmp = _mm_set_ss(v);
2772 return _mm_insert_ps(value, tmp, 0x00);
2774 return _mm_insert_ps(value, tmp, 0x10);
2776 return _mm_insert_ps(value, tmp, 0x20);
2778 return _mm_insert_ps(value, tmp, 0x30);
2783 #elif defined(NLIB_NEON)
2786 return F128::SetFloatToLane<0>(value, v);
2788 return F128::SetFloatToLane<1>(value, v);
2790 return F128::SetFloatToLane<2>(value, v);
2792 return F128::SetFloatToLane<3>(value, v);
2801 ret.vec.ps[0][0] = v;
2804 ret.vec.ps[0][1] = v;
2807 ret.vec.ps[1][0] = v;
2810 ret.vec.ps[1][1] = v;
2817 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
2820 template <bool IsHighA, bool IsHighB>
2821 float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;
2824 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2826 return vtrn1_f32(a, b);
2828 return vtrn_f32(a, b).val[0];
2833 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2835 return vtrn1_f32(vrev64_f32(a), b);
2837 return vtrn_f32(vrev64_f32(a), b).val[0];
2842 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2844 return vtrn1_f32(a, vrev64_f32(b));
2846 return vtrn_f32(a, vrev64_f32(b)).val[0];
2851 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2853 return vtrn2_f32(a, b);
2855 return vtrn_f32(a, b).val[1];
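// F64Merge<IsHighA, IsHighB> pairs one lane of a with one lane of b, taking
// the high (1) or low (0) lane of each: F64Merge<true, false>(a, b) yields
// { a[1], b[0] }. The vtrn forms above encode each case in one instruction.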
2864 return vget_low_f32(value);
2869 return vget_high_f32(value);
2872 template <int X0, int X1>
2873 struct F128SwizzleHelper2 {
2875 float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
2876 float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
2877 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
2882 struct F128SwizzleHelper2<X, X> {
2884 float32x2_t x = F128SwizzleGet64<X / 2>(value);
2885 return vdup_lane_f32(x, (X & 1));
2890 struct F128SwizzleHelper2<0, 1> {
2892 return vget_low_f32(value);
2897 struct F128SwizzleHelper2<0, 2> {
2900 return vget_low_f32(vuzp1q_f32(value, value));
2902 float32x2_t lo = vget_low_f32(value);
2903 float32x2_t hi = vget_high_f32(value);
2904 return vzip_f32(lo, hi).val[0];
2910 struct F128SwizzleHelper2<0, 3> {
2912 float32x2_t lo = vget_low_f32(value);
2913 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2915 return vzip1_f32(lo, hi);
2917 return vzip_f32(lo, hi).val[0];
2923 struct F128SwizzleHelper2<1, 0> {
2925 return vrev64_f32(vget_low_f32(value));
2930 struct F128SwizzleHelper2<1, 2> {
2932 float32x2_t lo = vget_low_f32(value);
2933 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2935 return vzip2_f32(lo, hi);
2937 return vzip_f32(lo, hi).val[1];
2943 struct F128SwizzleHelper2<1, 3> {
2946 return vget_low_f32(vuzp2q_f32(value, value));
2948 float32x2_t lo = vget_low_f32(value);
2949 float32x2_t hi = vget_high_f32(value);
2950 return vzip_f32(lo, hi).val[1];
2956 struct F128SwizzleHelper2<2, 0> {
2959 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
2961 float32x2_t lo = vget_low_f32(value);
2962 float32x2_t hi = vget_high_f32(value);
2963 return vzip_f32(hi, lo).val[0];
2969 struct F128SwizzleHelper2<2, 1> {
2972 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
2974 float32x2_t lo = vget_low_f32(value);
2975 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2976 return vzip_f32(hi, lo).val[1];
2982 struct F128SwizzleHelper2<2, 3> {
2984 return vget_high_f32(value);
2989 struct F128SwizzleHelper2<3, 0> {
2991 float32x2_t lo = vget_low_f32(value);
2992 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2994 return vzip1_f32(hi, lo);
2996 return vzip_f32(hi, lo).val[0];
3002 struct F128SwizzleHelper2<3, 1> {
3004 float32x2_t lo = vget_low_f32(value);
3005 float32x2_t hi = vget_high_f32(value);
3007 return vzip2_f32(hi, lo);
3009 return vzip_f32(hi, lo).val[1];
3015 struct F128SwizzleHelper2<3, 2> {
3017 return vrev64_f32(vget_high_f32(value));
3021 template <int V0, int V1, int V2, int V3>
3022 struct F128SwizzleHelper {
3024 return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
3025 detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
3029 template <int Vx, int Vy>
3030 struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
3032 float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
3033 return vcombine_f32(tmp, tmp);
3038 struct F128SwizzleHelper<V, V, V, V> {
3045 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3048 template <int X0, int X1>
3049 struct F128SwizzleHelper {
3054 struct F128SwizzleHelper<0, 0> {
3057 return __PS_MERGE00(v0, v0);
3062 struct F128SwizzleHelper<0, 1> {
3070 struct F128SwizzleHelper<0, 2> {
3072 return __PS_MERGE00(v0, v1);
3077 struct F128SwizzleHelper<0, 3> {
3079 return __PS_MERGE01(v0, v1);
3084 struct F128SwizzleHelper<1, 0> {
3087 return __PS_MERGE10(v0, v0);
3092 struct F128SwizzleHelper<1, 1> {
3095 return __PS_MERGE11(v0, v0);
3100 struct F128SwizzleHelper<1, 2> {
3102 return __PS_MERGE10(v0, v1);
3107 struct F128SwizzleHelper<1, 3> {
3109 return __PS_MERGE11(v0, v1);
3114 struct F128SwizzleHelper<2, 0> {
3116 return __PS_MERGE00(v1, v0);
3121 struct F128SwizzleHelper<2, 1> {
3123 return __PS_MERGE01(v1, v0);
3128 struct F128SwizzleHelper<2, 2> {
3131 return __PS_MERGE00(v1, v1);
3136 struct F128SwizzleHelper<2, 3> {
3144 struct F128SwizzleHelper<3, 0> {
3146 return __PS_MERGE10(v1, v0);
3151 struct F128SwizzleHelper<3, 1> {
3153 return __PS_MERGE11(v1, v0);
3158 struct F128SwizzleHelper<3, 2> {
3161 return __PS_MERGE10(v1, v1);
3166 struct F128SwizzleHelper<3, 3> {
3169 return __PS_MERGE11(v1, v1);
3176 template <int V0, int V1, int V2, int V3>
3183 #if defined(NLIB_F128_SIMD_NOUSE)
3185 ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
3186 ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
3187 ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
3188 ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
3190 #elif __has_builtin(__builtin_shufflevector)
3191 return __builtin_shufflevector(value, value, V0, V1, V2, V3);
3192 #elif defined(NLIB_SSE41)
3193 return _mm_shuffle_ps(value, value,
3194                       _MM_SHUFFLE(V3 != -1 ? V3 : 3,
3197                                   V0 != -1 ? V0 : 0));
3198 #elif defined(NLIB_NEON)
3199 return detail::F128SwizzleHelper<
3203 V3 != -1 ? V3 : 3>::Swizzle(value);
3206 ret.vec.ps[0] = detail::F128SwizzleHelper<
3207 (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3208 ret.vec.ps[1] = detail::F128SwizzleHelper<
3209 (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3214 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3217 NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
3219 return vzip1q_f32(value, value);
3221 return vzipq_f32(value, value).val[0];
3225 NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
3227 return vtrn1q_f32(value, value);
3229 return vtrnq_f32(value, value).val[0];
3233 NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
3237 NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
3239 return vuzp1q_f32(value, value);
3241 return vuzpq_f32(value, value).val[0];
3245 NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
3246 return vrev64q_f32(value);
3249 NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3251 return vtrn2q_f32(value, value);
3253 return vtrnq_f32(value, value).val[1];
3257 NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
3258 uint32x4_t ival = vreinterpretq_u32_f32(value);
3259 uint32x4_t rotated = vextq_u32(ival, ival, 1);
3260 return vreinterpretq_f32_u32(rotated);
3263 NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
3265 return vuzp2q_f32(value, value);
3267 return vuzpq_f32(value, value).val[1];
3271 NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3273 return vzip2q_f32(value, value);
3275 return vzipq_f32(value, value).val[1];
3279 NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
3280 uint32x4_t ival = vreinterpretq_u32_f32(value);
3281 uint32x4_t rotated = vextq_u32(ival, ival, 2);
3282 return vreinterpretq_f32_u32(rotated);
3285 NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
3286 uint32x4_t ival = vreinterpretq_u32_f32(value);
3287 uint32x4_t rotated = vextq_u32(ival, ival, 3);
3288 return vreinterpretq_f32_u32(rotated);
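// Usage sketch (illustrative): lane indices are template parameters, so the
// specializations above collapse to single instructions where NEON has one.
//
//   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 a = F128::Swizzle<1, 0, 3, 2>(v);   // {2, 1, 4, 3}: one vrev64q_f32
//   f128 b = F128::Swizzle<2, 3, 0, 1>(v);   // {3, 4, 1, 2}: one vextq_u32
//   f128 c = F128::Swizzle<3, 2, 1, 0>(v);   // {4, 3, 2, 1}: generic helper path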
3294 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3295 template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
3296 struct F128PermuteHelper2 {
3297 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3298 f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
3299 f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
3300 return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3301 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3305 template <bool UseShuffle, int V0, int V1, int V2, int V3>
3306 struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
3307 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3308 return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3309 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3313 template <int V0, int V1, int V2, int V3>
3314 struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
3315 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3316 return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
3317 _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
3322 struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
3323 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3324 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
3325 return _mm_castsi128_ps(tmp);
3330 struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
3331 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3332 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
3333 return _mm_castsi128_ps(tmp);
3338 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3339 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
// Note: corrected from alignr(b, a, 20). A byte shift of 20 reads past the
// 32-byte concatenation and zero-fills lane 3, so it can never produce a[0];
// alignr(a, b, 4) yields the intended {b1, b2, b3, a0}.
3340 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), 4);
3341 return _mm_castsi128_ps(tmp);
3346 struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
3347 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3349 return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
3354 struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
3355 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3357 return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
3362 struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
3363 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3365 return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
3370 struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
3371 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3373 return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
3378 struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
3379 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3381 return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
3386 struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
3387 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3389 return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
3394 struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
3395 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3397 return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
3402 struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
3403 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3405 return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
3409 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3410 struct F128PermuteHelper {
3411 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3412 return F128PermuteHelper2<
3413     ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
3414     ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
3415     V0, V1, V2, V3>::Permute(a, b);
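// Dispatch summary for the SSE path above: the first boolean is true when
// every output lane keeps its own position (V0 from {0,4}, V1 from {1,5},
// V2 from {2,6}, V3 from {3,7}), which a single _mm_blend_ps handles; the
// second is true when each half of the result draws entirely from one input,
// which a single _mm_shuffle_ps handles. Everything else goes through
// swizzle-then-blend.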
3419 #elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3422 float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;
3425 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3427 return vget_low_f32(a);
3430 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3432 return vget_high_f32(a);
3435 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3437 return vget_low_f32(b);
3440 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3442 return vget_high_f32(b);
3445 template <int X0, int X1>
3446 struct F128PermuteHelper2 {
3448 float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
3449 float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
3450 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
3455 struct F128PermuteHelper2<X, X> {
3457 float32x2_t x = F128PermuteGet64<X / 2>(a, b);
3458 return vdup_lane_f32(x, (X & 1));
3462 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3463 struct F128PermuteHelper {
3464 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3465 return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
3466 F128PermuteHelper2<V2, V3>::Permute(a, b));
3471 struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
3472 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3473 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
3474 return vreinterpretq_f32_s32(tmp);
3479 struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
3480 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3481 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
3482 return vreinterpretq_f32_s32(tmp);
3487 struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
3488 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3489 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
3490 return vreinterpretq_f32_s32(tmp);
3493 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3494 template<int R0, int R1, int VAR0, int VAR1>
3495 struct F128PermuteHelper2 {
3496 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
3499 template<int R0, int R1>
3500 struct F128PermuteHelper2<R0, R1, 0, 0> {
3501 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3502 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
3506 template<int R0, int R1>
3507 struct F128PermuteHelper2<R0, R1, 0, 1> {
3508 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3509 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
3513 template<int R0, int R1>
3514 struct F128PermuteHelper2<R0, R1, 0, 2> {
3515 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3516 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
3520 template<int R0, int R1>
3521 struct F128PermuteHelper2<R0, R1, 0, 3> {
3522 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3523 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
3527 template<int R0, int R1>
3528 struct F128PermuteHelper2<R0, R1, 1, 0> {
3529 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3530 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
3534 template<int R0, int R1>
3535 struct F128PermuteHelper2<R0, R1, 1, 1> {
3536 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3537 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
3541 template<int R0, int R1>
3542 struct F128PermuteHelper2<R0, R1, 1, 2> {
3543 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3544 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
3548 template<int R0, int R1>
3549 struct F128PermuteHelper2<R0, R1, 1, 3> {
3550 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3551 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
3555 template<int R0, int R1>
3556 struct F128PermuteHelper2<R0, R1, 2, 0> {
3557 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3558 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
3562 template<int R0, int R1>
3563 struct F128PermuteHelper2<R0, R1, 2, 1> {
3564 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3565 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
3569 template<int R0, int R1>
3570 struct F128PermuteHelper2<R0, R1, 2, 2> {
3571 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3572 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
3576 template<int R0, int R1>
3577 struct F128PermuteHelper2<R0, R1, 2, 3> {
3578 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3579 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
3583 template<int R0, int R1>
3584 struct F128PermuteHelper2<R0, R1, 3, 0> {
3585 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3586 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
3590 template<int R0, int R1>
3591 struct F128PermuteHelper2<R0, R1, 3, 1> {
3592 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3593 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
3597 template<int R0, int R1>
3598 struct F128PermuteHelper2<R0, R1, 3, 2> {
3599 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3600 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
3604 template<int R0, int R1>
3605 struct F128PermuteHelper2<R0, R1, 3, 3> {
3606 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3607 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
3611 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3612 struct F128PermuteHelper {
3613 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3615 f32x2 x0 = a.vec.ps[0];
3616 f32x2 x1 = a.vec.ps[1];
3617 f32x2 x2 = b.vec.ps[0];
3618 f32x2 x3 = b.vec.ps[1];
3619 ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
3620 ::Permute(x0, x1, x2, x3);
3621 ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
3622 ::Permute(x0, x1, x2, x3);
3627 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3628 struct F128PermuteHelper {
3629 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3630 f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
3631 F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
3632 F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
3633 F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
3639 template <int V0, int V1, int V2, int V3>
3640 struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
3641 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3643 return F128::Swizzle<V0, V1, V2, V3>(a);
3647 template <int V0, int V1, int V2, int V3>
3648 struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
3649 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3651 return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
3655 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3658 struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
3659 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3660 return _mm_unpacklo_ps(a, b);
3664 struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
3665 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3666 return _mm_unpacklo_ps(b, a);
3670 struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
3671 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3672 return _mm_unpackhi_ps(a, b);
3676 struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
3677 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3678 return _mm_unpackhi_ps(b, a);
3683 template<int V0, int V1, int V2, int V3>
3684 struct F128PermuteDontCareHelper {
3685 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3685 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3690 static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
3691 static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
3692 return detail::F128PermuteHelper< arg1, arg2,
3693 V0, V1, V2, V3 >::Permute(a, b);
3697 template<int V1, int V2, int V3>
3698 struct F128PermuteDontCareHelper<8, V1, V2, V3> {
3699 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3699 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3703 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3704 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3708 template<int V0, int V2, int V3>
3709 struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
3710 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3710 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3714 static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
3715 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3719 template<int V0, int V1, int V3>
3720 struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
3721 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3721 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3725 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3726 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3730 template<int V0, int V1, int V2>
3731 struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
3732 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3732 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3736 static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
3737 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3741 template<int V2, int V3>
3742 struct F128PermuteDontCareHelper<8, 8, V2, V3> {
3743 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3743 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3746 static const int V0 = (V2 < 4) ? 0 : 4;
3747 return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
3751 template<int V1, int V2>
3752 struct F128PermuteDontCareHelper<8, V1, V2, 8> {
3753 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3756 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3757 static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3758 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3762 template<int V0, int V1>
3763 struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
3764 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3764 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3767 static const int V2 = (V1 < 4) ? 2 : 6;
3768 return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
3772 template<int V0, int V3>
3773 struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
3774 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3774 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3777 static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3778 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3779 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3783 template<int V0, int V2>
3784 struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
3785 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3785 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3788 static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3789 static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3790 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3794 template<int V1, int V3>
3795 struct F128PermuteDontCareHelper<8, V1, 8, V3> {
3796 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3796 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3799 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3800 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3801 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3806 struct F128PermuteDontCareHelper<V, 8, 8, 8> {
3807 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3809 static const int V1 = ((V & 3) == 0) ? V + 1 : V;
3810 static const int V2 = ((V & 3) == 0) ? V + 2 : V;
3811 static const int V3 = ((V & 3) == 0) ? V + 3 : V;
3812 return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
3817 struct F128PermuteDontCareHelper<8, V, 8, 8> {
3818 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3820 static const int V0 = ((V & 3) == 1) ? V - 1 : V;
3821 static const int V2 = ((V & 3) == 1) ? V + 1 : V;
3822 static const int V3 = ((V & 3) == 1) ? V + 2 : V;
3823 return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
3828 struct F128PermuteDontCareHelper<8, 8, V, 8> {
3829 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3831 static const int V0 = ((V & 3) == 2) ? V - 2 : V;
3832 static const int V1 = ((V & 3) == 2) ? V - 1 : V;
3833 static const int V3 = ((V & 3) == 2) ? V + 2 : V;
3834 return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
3839 struct F128PermuteDontCareHelper<8, 8, 8, V> {
3840 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3842 static const int V0 = ((V & 3) == 3) ? V - 3 : V;
3843 static const int V1 = ((V & 3) == 3) ? V - 2 : V;
3844 static const int V2 = ((V & 3) == 3) ? V - 1 : V;
3845 return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
3850 struct F128PermuteDontCareHelper<8, 8, 8, 8> {
3851 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3859 template <int V0, int V1, int V2, int V3>
3861 NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3862 #if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
3863 return __builtin_shufflevector(a, b,
3864 (V0 != 8 ? V0 : -1),
3865 (V1 != 8 ? V1 : -1),
3866 (V2 != 8 ? V2 : -1),
3867 (V3 != 8 ? V3 : -1));
3869 return detail::F128PermuteDontCareHelper <
3873 V3 != -1 ? V3 : 8>::Permute(a, b);
3877 template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
3880 NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
3881 #if defined(NLIB_NEON)
3882 const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
3883 const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
3884 const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
3885 const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
3888 const int v0 = SplatLane0 ? 4 : 0;
3889 const int v1 = SplatLane1 ? 5 : 1;
3890 const int v2 = SplatLane2 ? 6 : 2;
3891 const int v3 = SplatLane3 ? 7 : 3;
3893 return F128::Permute<v0, v1, v2, v3>(value, splat);
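// Usage sketch (illustrative): Permute indices 0-3 select lanes of a, 4-7
// lanes of b; Splat replaces the flagged lanes of value with lanes of splat.
// The NEON lane choices above assume splat is a broadcast (all lanes equal),
// which lets adjacent flagged lanes form cheap 64-bit pairs.
//
//   f128 a = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 b = F128::SetValue(5.f, each_float);
//   f128 lo = F128::Permute<0, 1, 4, 5>(a, b);               // {1, 2, 5, 5}
//   f128 r  = F128::Splat<true, false, false, true>(a, b);   // {5, 2, 3, 5}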
3897 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3899 ret.vec.v[0] = powf(2.f, value.vec.v[0]);
3900 ret.vec.v[1] = powf(2.f, value.vec.v[1]);
3901 ret.vec.v[2] = powf(2.f, value.vec.v[2]);
3902 ret.vec.v[3] = powf(2.f, value.vec.v[3]);
3905 i128 iround = F128::ConvertToI128Round(value);
3906 f128 fround = F128::ConvertFromI128(iround);
3907 f128 x = F128::Sub(value, fround);
3908 f128 xx = F128::Mult(x, x);
3910 f128 P = F128::LoadA16(F128::exp2_P_);
3911 f128 Q = F128::LoadA16(F128::exp2_Q_);
3919 px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
3920 px = F128::Mult(x, px);
3926 qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
3928 x = F128::Div(px, F128::Sub(qx, px));
3932 iround = I128::Add32(iround, I128::SetValue(127, each_int32));
3933 iround = I128::ShiftLeftLogical32(iround, 23);
3934 x = F128::Mult(x, F128::CastFromI128(iround));
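// The three lines above synthesize 2^round(value) without a table: adding the
// bias 127 and shifting into bits 30:23 forms the IEEE-754 encoding of 2^n
// directly, so Exp2(v) = 2^round(v) * 2^frac(v), where the fractional factor
// comes from the rational approximation px/qx computed just before.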
3943 static const float log2e = 1.44269504088896340736f;
3944 return Exp2(F128::Mult(log2e, value));
3948 static const float log2e = 1.44269504088896340736f;
3949 f128 negOne = F128::SetValue(-1.f, each_float);
3950 f128 v0 = F128::MultAdd(log2e, value, negOne);
3951 f128 v1 = F128::MultSub(log2e, value, negOne);
3954 return F128::Sub(e0, e1);
3958 static const float log2e = 1.44269504088896340736f;
3959 f128 negOne = F128::SetValue(-1.f, each_float);
3960 f128 v0 = F128::MultAdd(log2e, value, negOne);
3961 f128 v1 = F128::MultSub(log2e, value, negOne);
3964 return F128::Add(e0, e1);
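// Sinh and Cosh share one trick: with v0 = x*log2(e) - 1 and v1 = -x*log2(e) - 1
// (the MultAdd/MultSub above), the elided e0 = Exp2(v0) and e1 = Exp2(v1)
// equal e^x / 2 and e^-x / 2, so Sinh(x) = e0 - e1 and Cosh(x) = e0 + e1
// with no separate divide by two.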
3969 f128 cvalue = F128::LoadA16(tanh_cvalue_);
3973 e = F128::MultAdd(half, e, half);
3975 return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
3979 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3981 ret.vec.v[0] = tanf(value.vec.v[0]);
3982 ret.vec.v[1] = tanf(value.vec.v[1]);
3983 ret.vec.v[2] = tanf(value.vec.v[2]);
3984 ret.vec.v[3] = tanf(value.vec.v[3]);
3988 f128 C = F128::LoadA16(&F128::tan_c_[0]);
3991 f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
3994 i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
3995 i128 cmp = I128::CmpEq32(t0, I128::SetZero());
3996 nearXAxis = F128::CastFromI128(cmp);
4003 f128 nearAxis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));
4005 f128 P = F128::LoadA16(&F128::tan_p_[0]);
4006 f128 Q = F128::LoadA16(&F128::tan_q_[0]);
4008 f128 ff = F128::Mult(f, f);
4012 p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
4013 p = F128::MultAdd(p, ff, one);
4014 p = F128::Mult(f, p);
4017 q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
4018 q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
4019 q = F128::MultAdd(q, ff, one);
4021 p = F128::Select(nearAxis, f, p);
4022 q = F128::Select(nearAxis, one, q);
4024 f128 r0 = F128::Div(p, q);
4025 f128 r1 = F128::Negate(F128::Recp(r0));
4027 return F128::Select(nearXAxis, r0, r1);
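// Tan folds x into octants via g = Round(value * C[0]) (C lane 0 is presumably
// 2/pi): for even g the rational p/q approximates tan(f) directly (r0); for
// odd g the cotangent identity tan(x) = -1/tan(x - pi/2) applies, so
// r1 = -Recp(r0) is selected instead.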
4032 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
4033 static const float scale = 1.4426950408889634f;
4035 ret.vec.v[0] = logf(value.vec.v[0]);
4036 ret.vec.v[1] = logf(value.vec.v[1]);
4037 ret.vec.v[2] = logf(value.vec.v[2]);
4038 ret.vec.v[3] = logf(value.vec.v[3]);
4039 return F128::Mult(scale, ret);
4042 f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
4043 x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
4044 i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
4045 e = I128::ShiftRightLogical32(e, 23);
4046 e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
4048 x = F128::Sub(x, F128::SetOne());
4049 f128 z = F128::Mult(x, x);
4052 f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
4053 f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
4054 f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
4057 p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
4058 p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
4059 p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
4060 p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
4061 p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
4063 f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
4064 q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
4065 q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
4066 q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
4067 q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
4069 y = F128::Mult(z, p);
4070 y = F128::Div(y, q);
4071 y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
4077 result = F128::Mult(y, log2ea);
4078 result = F128::MultAdd(log2ea, x, result);
4079 result = F128::Add(result, y);
4080 result = F128::Add(result, x);
4081 result = F128::Add(result, F128::ConvertFromI128(e));
4085 f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
4088 f128 is_nan = F128::IsNaN(value);
4090 result = F128::Select(is_nan, nan, result);
4092 f128 is_inf = F128::IsInfinite(value);
4093 f128 is_pos = F128::CmpGtZero(value);
4097 f128 is_pos_inf = F128::And(is_inf, is_pos);
4098 result = F128::Select(is_pos_inf, inf, result);
4102 f128 is_zero = F128::CmpEqZero(value);
4103 result = F128::Select(is_zero, neg_inf, result);
4107 f128 is_neg = F128::CmpLtZero(value);
4108 result = F128::Select(is_neg, neg_nan, result);
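// Fast-path summary: the bit masks above split value into m * 2^e with m in
// [1, 2), so Log2(value) = e + log2(m); log2(m) is evaluated in x = m - 1 via
// the rational fit, with log2ea presumably log2(e) - 1 so that
// (x + y) * log2(e) can be formed as x + y + (x + y) * log2ea. The final
// Selects patch NaN, +inf, zero (-> -inf), and negative inputs (-> NaN).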
4118 #ifdef NLIB_F128_SIMD_NOUSE
4120 ret.vec.v[0] = logf(value.vec.v[0]);
4121 ret.vec.v[1] = logf(value.vec.v[1]);
4122 ret.vec.v[2] = logf(value.vec.v[2]);
4123 ret.vec.v[3] = logf(value.vec.v[3]);
4126 f128 x = F128::Log2(value);
4127 static const float recp_log2e = 0.6931471805597018f;
4128 return F128::Mult(recp_log2e, x);
4134 #endif  // NLIB_DOXYGEN
4145 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4158 SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4159            float m13, float m20, float m21, float m22, float m23, float m30, float m31,
4167 inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
4168                               float m12, float m13, float m20, float m21, float m22, float m23,
4170 r[0] = F128::SetValue(m00, m01, m02, m03);
4171 r[1] = F128::SetValue(m10, m11, m12, m13);
4172 r[2] = F128::SetValue(m20, m21, m22, m23);
4173 r[3] = F128::SetValue(m30, m31, m32, m33);
4177 uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4178 NLIB_ASSERT((algn & 3) == 0);
4179 switch (algn >> 2) {
4181 r[0] = F128::LoadA16(p);
4182 r[1] = F128::LoadA16(p + 4);
4183 r[2] = F128::LoadA16(p + 8);
4184 r[3] = F128::LoadA16(p + 12);
4187 r[0] = F128::LoadA4(p);
4188 r[1] = F128::LoadA4(p + 4);
4189 r[2] = F128::LoadA4(p + 8);
4190 r[3] = F128::LoadA4(p + 12);
4193 r[0] = F128::LoadA8(p);
4194 r[1] = F128::LoadA8(p + 4);
4195 r[2] = F128::LoadA8(p + 8);
4196 r[3] = F128::LoadA8(p + 12);
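// Usage sketch (illustrative; assumes the enclosing constructor takes a
// const float* of 16 row-major values): the switch above picks the strongest
// load the pointer's actual alignment allows.
//
//   NLIB_ALIGNAS(16) float m[16] = { 1.f, 0.f, 0.f, 0.f,
//                                    0.f, 1.f, 0.f, 0.f,
//                                    0.f, 0.f, 1.f, 0.f,
//                                    0.f, 0.f, 0.f, 1.f };
//   SimdMatrix mtx(m);   // 16-byte aligned, so the LoadA16 path is taken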
4204 #if (defined(_MSC_VER) && _MSC_VER < 1800) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
4210 #if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
4211 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4213     f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
4214     f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
4215     f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
4216     f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
4217     row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
4218     row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
4219     row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
4220     row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
4222 #elif defined(NLIB_NEON)
4224 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4226     float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4227     float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4228     uint64x2_t row0_, row1_, row2_, row3_; \
4229     row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4230                        vreinterpretq_u64_f32(trn_f1_.val[0])); \
4231     row0 = vreinterpretq_f32_u64(row0_); \
4232     row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4233                        vreinterpretq_u64_f32(trn_f1_.val[1])); \
4234     row1 = vreinterpretq_f32_u64(row1_); \
4235     row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4236                        vreinterpretq_u64_f32(trn_f1_.val[0])); \
4237     row2 = vreinterpretq_f32_u64(row2_); \
4238     row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4239                        vreinterpretq_u64_f32(trn_f1_.val[1])); \
4240     row3 = vreinterpretq_f32_u64(row3_); \
4243 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4245     float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4246     float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4247     row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
4248     row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
4249     row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
4250     row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
4254 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4257     tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
4258     tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
4259     row0.vec.ps[0] = tmp0; \
4260     row1.vec.ps[0] = tmp1; \
4261     tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
4262     tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
4263     row2.vec.ps[1] = tmp0; \
4264     row3.vec.ps[1] = tmp1; \
4265     tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4266     tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4267     row0.vec.ps[1] = row2.vec.ps[0]; \
4268     row1.vec.ps[1] = row3.vec.ps[0]; \
4269     row2.vec.ps[0] = tmp0; \
4270     row3.vec.ps[0] = tmp1; \
4271     tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4272     tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4273     row0.vec.ps[1] = tmp0; \
4274     row1.vec.ps[1] = tmp1; \
4299 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4307 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4315 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4326 #endif  // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_