#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

#ifdef NN_PLATFORM_CTR
# ifndef __USE_C99_MATH
#  define __USE_C99_MATH
# endif
#endif
// ...
#ifndef INFINITY
#define INFINITY ((float)(1e+300 * 1e+300))
#endif

#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

// f128 definition per backend (scalar fallback, SSE4.1, NEON; bodies elided
// in this extract).
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
// ...
#elif defined(NLIB_NEON)
// ...
#endif

#if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// ...
#endif

// Extra f128 arguments are passed by const reference where the ABI cannot
// keep them in registers, by value otherwise.
#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif
#if !defined(_MSC_VER) && !defined(__vectorcall)
#define __vectorcall
#endif

// The declarations below are static members of class F128; the class head and
// several members are elided in this extract.
// ...
static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
template <size_t N>  // template line reconstructed from the definition below
static f128 __vectorcall SetZeroToLane(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA16(const float* p) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA8(const float* p) NLIB_NOEXCEPT;
static f128 __vectorcall LoadA4(const float* p) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT;
static void __vectorcall StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT;
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// ...
template <size_t N>  // template line reconstructed from the definition below
static f128 __vectorcall ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT;
// ...
#endif
static f128 __vectorcall Add(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Sub(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Mult(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Mult(float a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Div(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Negate(f128arg value) NLIB_NOEXCEPT;
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
static f128 __vectorcall NegateEx(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
template <size_t N>  // lane variant: c + b * a[N] (template line reconstructed)
static f128 __vectorcall MultAdd(f128arg a, f128arg b, f128arg c,
                                 each_select32_tag) NLIB_NOEXCEPT;  // tag type name assumed
static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT;
static f128 __vectorcall MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT;
template <size_t N>  // lane variant: c - b * a[N] (template line reconstructed)
static f128 __vectorcall MultSub(f128arg a, f128arg b, f128arg c,
                                 each_select32_tag) NLIB_NOEXCEPT;  // tag type name assumed
static f128 __vectorcall PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Max(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Min(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT;
static f128 __vectorcall Saturate(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpEst(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall SqrtEst(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpSqrt(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall RecpSqrtEst(f128arg value) NLIB_NOEXCEPT;
// ...
static f128 __vectorcall Truncate(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall And(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Or(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall Xor(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT;
static f128 __vectorcall InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT;
static f128 __vectorcall CmpEqZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLtZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpLeZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGtZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpGeZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNeZero(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT;
static f128 __vectorcall AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
static f128 __vectorcall SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT;
static f128 __vectorcall ModAngle(f128arg value) NLIB_NOEXCEPT;
static f128x2 __vectorcall SinCos(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcSin(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcCos(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcTan(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT;
static f128 __vectorcall Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT;
static f128 __vectorcall
Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1, f128arg_ex t) NLIB_NOEXCEPT;
static f128 __vectorcall
CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3, f128arg_ex t) NLIB_NOEXCEPT;
static f128 __vectorcall
BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f, f128arg_ex g) NLIB_NOEXCEPT;
static int __vectorcall MoveMask(f128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT;
static bool __vectorcall IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT;
static f128 __vectorcall Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT;
static f128 __vectorcall IsInfinite(f128arg value) NLIB_NOEXCEPT;
template <size_t N>  // template lines in this group reconstructed
static float __vectorcall GetFloatFromLane(f128arg value) NLIB_NOEXCEPT;
template <size_t N>
static uint32_t __vectorcall GetUint32FromLane(f128arg value) NLIB_NOEXCEPT;
static float __vectorcall GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
static uint32_t __vectorcall GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT;
template <size_t N>
static f128 __vectorcall SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT;
static f128 __vectorcall SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT;
template <int V0, int V1, int V2, int V3>
static f128 __vectorcall Swizzle(f128arg value) NLIB_NOEXCEPT;
template <int V0, int V1, int V2, int V3>
static f128 __vectorcall Permute(f128arg a, f128arg b) NLIB_NOEXCEPT;
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
static f128 __vectorcall Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT;
template <size_t N>  // template lines reconstructed for this group
static f128 __vectorcall RotateLeft(f128arg value) NLIB_NOEXCEPT {
  const size_t NN = 4 - N;
  return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
}
template <size_t N>
static f128 __vectorcall RotateRight(f128arg value) NLIB_NOEXCEPT {
  return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
}
template <size_t N>
static f128 __vectorcall ShiftRight(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
}
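// Added note (not in the original header), writing vectors as
// {lane0, lane1, lane2, lane3}:
//   RotateLeft<1>({0,1,2,3})            -> {3,0,1,2}  (Swizzle<3,0,1,2>)
//   RotateRight<1>({0,1,2,3})           -> {1,2,3,0}  (Swizzle<1,2,3,0>)
//   ShiftRight<1>({0,1,2,3}, {4,5,6,7}) -> {1,2,3,4}  (Permute<1,2,3,4>)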
// ... (remaining members and the end of class F128 elided in this extract)

#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// Splat a scalar to all four lanes (header reconstructed; tag type name assumed).
NLIB_M(f128) F128::SetValue(float v, each_float_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
  return vdupq_n_f32(v);
#else
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
  return ret;
#endif
}
// Splat a 32-bit pattern to all lanes (header reconstructed; tag type name assumed).
NLIB_M(f128) F128::SetValue(uint32_t v, each_uint32_tag) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  union { uint32_t u32; float f32; } tmp;  // type-pun reconstructed
  tmp.u32 = v;
  return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vdupq_n_u32(v);
  return vreinterpretq_f32_u32(tmp);
#else
  union { uint32_t u32; float f32; } tmp;  // type-pun reconstructed
  tmp.u32 = v;
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
  return ret;
#endif
}
NLIB_M(f128) F128::SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
  // ... (packs {a, b} and {c, d} into two 64-bit patterns tmp1 and tmp2; elided)
  return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#else
  f128 ret;
  ret.vec.ps[0][0] = a;
  ret.vec.ps[0][1] = b;
  ret.vec.ps[1][0] = c;
  ret.vec.ps[1][1] = d;
  return ret;
#endif
}
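// Usage sketch (added; not part of the original header). SetValue's four
// arguments land in lanes 0..3 in order; note that _mm_set_ps takes them
// reversed. The helper name is hypothetical.
inline void F128SetValueExample_() {
  f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);  // lanes {1, 2, 3, 4}
  float out[4];
  F128::StoreA4(out, v);  // StoreA4 needs only 4-byte alignment; out == {1,2,3,4}
}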
// F128::SetValue<N>(value, each_select32): splat lane N to every lane
// (template header elided in this extract).
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[N];
  ret.vec.v[1] = value.vec.v[N];
  ret.vec.v[2] = value.vec.v[N];
  ret.vec.v[3] = value.vec.v[N];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);  // lanes 0/1; lanes 2/3 are specialized below
  return vdupq_lane_f32(tmp, N);
#else
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
  return ret;
#endif
// NEON and CAFE lane-splat specializations (specialization headers elided):
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  // SetValue<2>: splat lane 2
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 0);
  // SetValue<3>: splat lane 3
  float32x2_t tmp = vget_high_f32(value);
  return vdupq_lane_f32(tmp, 1);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);  // lane 0
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);  // lane 1
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);  // lane 2
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);  // lane 3
#endif
// F128::SetZero(): every lane 0.f (function header elided in this extract).
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_setzero_ps();
#elif defined(NLIB_NEON)
  return vdupq_n_f32(0);
#else
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
  return ret;
#endif
576 float32x2_t x00 = vcreate_f32(0ULL);
577 return vcombine_f32(x10, x00);
579 return F128::LoadA16(F128::v1000_);
584 #ifdef NLIB_F128_SIMD_NOUSE 591 #elif defined(NLIB_NEON) 592 float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
593 float32x2_t x00 = vcreate_f32(0ULL);
594 return vcombine_f32(x01, x00);
596 return F128::LoadA16(F128::v0100_);
601 #ifdef NLIB_F128_SIMD_NOUSE 608 #elif defined(NLIB_NEON) 609 float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
610 float32x2_t x00 = vcreate_f32(0ULL);
611 return vcombine_f32(x00, x10);
613 return F128::LoadA16(F128::v0010_);
618 #ifdef NLIB_F128_SIMD_NOUSE 625 #elif defined(NLIB_NEON) 626 float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
627 float32x2_t x00 = vcreate_f32(0ULL);
628 return vcombine_f32(x00, x01);
630 return F128::LoadA16(F128::v0001_);
template <size_t N>
NLIB_M(f128) F128::SetZeroToLane(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_insert_ps(value, value, 1 << N);
#elif defined(NLIB_NEON)
  return F128::Permute<N == 0 ? 4 : 0,
                       N == 1 ? 5 : 1,
                       N == 2 ? 6 : 2,
                       N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
#else
  f128 ret = value;
  ret.vec.ps[N / 2][N % 2] = 0.f;
  return ret;
#endif
}
// Constant splats (function headers elided in this extract):
  // SetOne(): {1, 1, 1, 1}
  return F128::SetValue(1.f, each_float);
  // minus-one splat
  return F128::SetValue(-1.f, each_float);
  // epsilon splat
  return F128::SetValue(1.0e-7f, each_float);
// ...
  // SetSignMask(): -0.f puts 0x80000000 in every lane
  return F128::SetValue(-0.f, each_float);
NLIB_M(f128) F128::LoadA16(const float* p) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_load_ps(p);
#elif defined(NLIB_NEON)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
NLIB_M(f128) F128::LoadA4(const float* p) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
  return vld1q_f32(p);  // reconstructed: 4-byte-aligned NEON load
#else
  f128 ret;
  ret.vec.ps[0][0] = p[0];
  ret.vec.ps[0][1] = p[1];
  ret.vec.ps[1][0] = p[2];
  ret.vec.ps[1][1] = p[3];
  return ret;
#endif
}
NLIB_M(f128) F128::LoadA8(const float* p) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
  uint64x2_t val = vld1q_u64(tmp);
  return vreinterpretq_f32_u64(val);
#else
  return LoadA4(p);  // fallback assumed (8-byte alignment satisfies the A4 path)
#endif
}
748 return LoadA8(reinterpret_cast<const float*>(p));
753 return LoadA4(reinterpret_cast<const float*>(p));
758 return LoadA16(reinterpret_cast<const float*>(p));
763 return LoadA8(reinterpret_cast<const float*>(p));
768 return LoadA4(reinterpret_cast<const float*>(p));
NLIB_M(void) F128::StoreA16(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
  p[2] = value.vec.v[2];
  p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_store_ps(p, value);
#elif defined(NLIB_NEON)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // ...
#elif defined(NLIB_SSE41)
  _mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
  vst1q_f32(p, value);  // reconstructed: 4-byte-aligned NEON store
#else
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
  p[2] = value.vec.ps[1][0];
  p[3] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreA8(float* p, f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint64x2_t tmp = vreinterpretq_u64_f32(value);
  vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  StoreA4(p, value);  // fallback assumed (8-byte alignment satisfies the A4 path)
#endif
}
NLIB_M(void) F128::StoreA16(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA16(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA16(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
  vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
NLIB_M(void) F128::StoreLoA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[0];
  p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
  _mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_low_f32(value);
  vst1_f32(p, tmp);  // reconstructed store of the low pair
#else
  p[0] = value.vec.ps[0][0];
  p[1] = value.vec.ps[0][1];
#endif
}
NLIB_M(void) F128::StoreLoA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreLoA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreLoA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  vst1_f32(p, vget_high_f32(value));
#else
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreHiA4(float* p, f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  p[0] = value.vec.v[2];
  p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
  _mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
  float32x2_t tmp = vget_high_f32(value);
  vst1_f32(p, tmp);  // reconstructed store of the high pair
#else
  p[0] = value.vec.ps[1][0];
  p[1] = value.vec.ps[1][1];
#endif
}
NLIB_M(void) F128::StoreHiA8(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(uintptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA8(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA8(reinterpret_cast<float*>(p), value);
}
NLIB_M(void) F128::StoreHiA4(intptr_t p, f128arg value) NLIB_NOEXCEPT {
  StoreHiA4(reinterpret_cast<float*>(p), value);
}
NLIB_M(f128) F128::Abs(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
  ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
  ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
  ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vabsq_f32(value);
#elif defined(NLIB_SSE41)
  // clear the sign bit: andnot with -0.0f (0x80000000)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_andnot_ps(signmask, value);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::Select(f128arg mask, f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 result;
  result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
  result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
  result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
  result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
  return result;
#elif defined(NLIB_SSE41)
  return _mm_blendv_ps(b, a, mask);
#elif defined(NLIB_NEON)
  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#else
  // __PS_SEL tests the sign, so turn all-ones lanes into an ordinary
  // negative float before selecting
  f128 mask_ = mask;  // local copy reconstructed
  mask_.vec.u[0] &= 0xFF7FFFFFUL;
  mask_.vec.u[1] &= 0xFF7FFFFFUL;
  mask_.vec.u[2] &= 0xFF7FFFFFUL;
  mask_.vec.u[3] &= 0xFF7FFFFFUL;
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
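// Usage sketch (added; not part of the original header): comparisons return
// all-ones/all-zeros lane masks, so Select gives a branch-free per-lane
// conditional. The helper name is hypothetical.
inline f128 F128MinViaSelect_(f128arg x, f128arg y) {
  f128 lt = F128::CmpLt(x, y);    // 0xFFFFFFFF where x < y, else 0
  return F128::Select(lt, x, y);  // per-lane min(x, y)
}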
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// i128 <-> f128 conversions; the headers below are reconstructed from call sites.
NLIB_M(f128) F128::ConvertFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
  return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif
}

NLIB_M(f128) F128::CastFromI128(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_f32_s8(value);
#endif
}

// float -> int32 with round-to-nearest (function header elided in this extract):
#if defined(NLIB_SSE41)
  return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
  // vcvtq truncates, so bias each lane by +/-0.5 carrying the lane's own sign
  uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
  uint32x4_t sgn = vdupq_n_u32(0x80000000U);
  uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
  w = vorrq_u32(w, half);
  return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif

NLIB_M(i128) F128::ConvertToI128Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif
}

NLIB_M(i128) F128::CastToI128(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_SSE41)
  return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
  return vreinterpretq_s8_f32(value);
#endif
}
template <size_t N>
NLIB_M(f128) F128::ConvertFromFixedPoint(i128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
  // scale by 2^-N: (0x7F - N) << 23 is the bit pattern of the float 2^-N
  f128 f = F128::ConvertFromI128(value);
  f128 m = F128::SetValue(((0x7F - N) << 23), each_uint32);
  return F128::Mult(f, m);
#endif
}

template <size_t N>
NLIB_M(i128) F128::ConvertToFixedPoint(f128arg value) NLIB_NOEXCEPT {  // name assumed
#if defined(NLIB_NEON)
  return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
  // scale by 2^N ((0x7F + N) << 23 is the float 2^N), then truncate
  f128 m = F128::SetValue(((0x7F + N) << 23), each_uint32);
  f128 f = F128::Mult(value, m);
  return F128::ConvertToI128Truncate(f);
#endif
}
// ...
#endif  // closes the !NLIB_F128_SIMD_NOUSE && !CAFE region (reconstructed)
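// Worked example (added; uses the assumed name above): with N = 8 the scale
// factor is 2^8 = 256, so 1.5f encodes to 384 and decodes back to 1.5f:
//   i128 q = F128::ConvertToFixedPoint<8>(F128::SetValue(1.5f, each_float));
//   f128 r = F128::ConvertFromFixedPoint<8>(q);  // every lane is 1.5f again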
NLIB_M(f128) F128::CmpLt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcltq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpLe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcleq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGt(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgtq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpGe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vcgeq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}

NLIB_M(f128) F128::CmpNe(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::Add(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
  return vaddq_f32(a, b);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

NLIB_M(f128) F128::Sub(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
  return vsubq_f32(a, b);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::Negate(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = -value.vec.v[0];
  ret.vec.v[1] = -value.vec.v[1];
  ret.vec.v[2] = -value.vec.v[2];
  ret.vec.v[3] = -value.vec.v[3];
  return ret;
#elif defined(NLIB_NEON)
  return vnegq_f32(value);
#elif defined(NLIB_SSE41)
  // flip the sign bit: xor with -0.0f (0x80000000)
  const __m128 signmask = _mm_set1_ps(-0.0f);
  return _mm_xor_ps(signmask, value);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::Mult(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
  return vmulq_f32(a, b);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

NLIB_M(f128) F128::Mult(float a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
  return ret;
#else
  return F128::Mult(b, F128::SetValue(a, each_float));
#endif
}

// Lane variant: multiply every lane of b by lane N of a (template header
// reconstructed; tag type name assumed).
template <size_t N>
NLIB_M(f128) F128::Mult(f128arg a, f128arg b, each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vmulq_laneq_f32(b, a, N);
#else
  float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
  return vmulq_n_f32(b, tmp);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  float t = a.vec.ps[N / 2][N % 2];
  f128 ret;
  ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
  ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
  return ret;
#else
  return F128::Mult(F128::SetValue<N>(a, each_select32), b);  // fallback reconstructed
#endif
}
NLIB_M(f128) F128::Div(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vdivq_f32(a, b);
#else
  // ARMv7 NEON has no vector divide: refine a reciprocal estimate with two
  // Newton-Raphson steps, force 1/0 to +inf, then multiply by a
  float32x4_t inv0 = vrecpeq_f32(b);
  float32x4_t step0 = vrecpsq_f32(inv0, b);
  float32x4_t inv1 = vmulq_f32(step0, inv0);
  float32x4_t step1 = vrecpsq_f32(inv1, b);
  float32x4_t inv2 = vmulq_f32(step1, inv1);
  uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
  inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);
  return vmulq_f32(a, inv2);
#endif
#else
  f128 ret;
  ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}
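// Added note: the ARMv7 path above is standard Newton-Raphson refinement for
// x ~ 1/b, x' = x * (2 - b*x); vrecpsq_f32(x, b) returns (2 - b*x), so each
// vmulq_f32 is one iteration, and two iterations take the ~8-bit hardware
// estimate close to full single precision.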
NLIB_M(f128) F128::Max(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
  return vmaxq_f32(a, b);
#else
  // __PS_SEL picks by the sign of a - b
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
  return ret;
#endif
}

NLIB_M(f128) F128::Min(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
  ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
  ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
  ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
  return vminq_f32(a, b);
#else
  f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
  f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
  f128 ret;
  ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
  ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::PairwiseMax(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpmaxq_f32(a, b);
#else
  float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#else
  f32x2 v02, v13, cmp;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
  return ret;
#endif
}

NLIB_M(f128) F128::PairwiseMin(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
  f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpminq_f32(a, b);
#else
  float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
  float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
  return vcombine_f32(rl, rh);
#endif
#else
  f32x2 v02, v13, cmp;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  cmp = __PS_SUB(v02, v13);
  ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
  return ret;
#endif
}
NLIB_M(f128) F128::PairwiseAdd(f128arg a, f128arg b) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
  ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
  ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
  ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vpaddq_f32(a, b);
#else
  float32x2_t al = vget_low_f32(a);
  float32x2_t ah = vget_high_f32(a);
  float32x2_t l = vpadd_f32(al, ah);
  float32x2_t bl = vget_low_f32(b);
  float32x2_t bh = vget_high_f32(b);
  float32x2_t h = vpadd_f32(bl, bh);
  return vcombine_f32(l, h);
#endif
#else
  f32x2 v02, v13, cmp;
  f128 ret;
  v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
  v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
  ret.vec.ps[0] = __PS_ADD(v02, v13);
  v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
  v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
  ret.vec.ps[1] = __PS_ADD(v02, v13);
  return ret;
#endif
}
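// Usage sketch (added; not part of the original header): PairwiseAdd folds
// neighboring lanes, so two applications produce a horizontal sum. The helper
// name is hypothetical.
inline float F128HorizontalSum_(f128arg v) {
  f128 s = F128::PairwiseAdd(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
  s = F128::PairwiseAdd(s, s);       // every lane holds v0+v1+v2+v3
  return F128::GetFloatFromLane<0>(s);
}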
NLIB_M(f128) F128::AbsDiff(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  return vabdq_f32(a, b);
#else
  return F128::Abs(F128::Sub(a, b));
#endif
}
// MultAdd(a, b, c) computes c + a * b (fused where the hardware allows).
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_f32(c, a, b);
#else
  return vmlaq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Add(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultAdd(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_n_f32(c, b, a);
#else
  return vmlaq_n_f32(c, b, a);
#endif
#else
  return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif
}

// Lane variant: c + b * a[N] (template header reconstructed; tag type name assumed).
template <size_t N>
NLIB_M(f128) F128::MultAdd(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmaq_laneq_f32(c, b, a, N);
#else
  return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
// MultSub(a, b, c) computes c - a * b.
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
  ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
  return ret;
#else
  return F128::Sub(c, F128::Mult(a, b));
#endif
}

NLIB_M(f128) F128::MultSub(float a, f128arg b, f128arg c) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_n_f32(c, b, a);
#else
  return vmlsq_n_f32(c, b, a);
#endif
#else
  return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif
}

// Lane variant: c - b * a[N] (template header reconstructed; tag type name assumed).
template <size_t N>
NLIB_M(f128) F128::MultSub(f128arg a, f128arg b, f128arg c,
                           each_select32_tag) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
  return vfmsq_laneq_f32(c, b, a, N);
#else
  return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
  return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
#endif
}
NLIB_M(f128) F128::Lerp(f128arg a, f128arg b, f128arg t) NLIB_NOEXCEPT {
  // a + t * (b - a)
  return F128::MultAdd(t, F128::Sub(b, a), a);
}
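// Added note: Lerp(a, b, t) = a + t*(b - a), so t = 0 gives a, t = 1 gives b,
// and t outside [0, 1] extrapolates. For example, with a = 0, b = 10 and
// t = 0.25 every lane evaluates to 2.5f.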
NLIB_M(f128) F128::And(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M2(f128) F128::AddAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  // wrap angle1 + angle2 back into the principal range
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);  // assumed: pi_values_[2] holds 2*pi
  f128 sum = F128::Add(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));  // sum < -pi?
  f128 ofs = F128::And(cond, pi_dbl);
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));       // sum >= pi?
  ofs = F128::And(cond, pi_dbl);
  return F128::Sub(result, ofs);
}
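// Added note: AddAngle keeps the sum in the principal range; for instance
// 3.0 + 0.5 = 3.5 rad exceeds pi and wraps to 3.5 - 2*pi ~= -2.783 rad.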
NLIB_M2(f128) F128::SubAngle(f128arg angle1, f128arg angle2) NLIB_NOEXCEPT {
  // same wrapping as AddAngle, applied to the difference
  f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
  f128 pi_dbl = F128::SetValue<2>(pi_pi2, each_select32);  // assumed: pi_values_[2] holds 2*pi
  f128 sum = F128::Sub(angle1, angle2);
  f128 cond = F128::CmpLt(sum, F128::SetValue<1>(pi_pi2, each_select32));
  f128 ofs = F128::And(cond, pi_dbl);
  f128 result = F128::Add(sum, ofs);
  cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
  ofs = F128::And(cond, pi_dbl);
  return F128::Sub(result, ofs);
}
NLIB_M2(f128) F128::Hermite(f128arg p0, f128arg v0, f128arg p1, f128arg_ex v1,
                            f128arg_ex t) NLIB_NOEXCEPT {
  // builds the Hermite basis weights {2t^3-3t^2+1, t^3-2t^2+t, -2t^3+3t^2, t^3-t^2}
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 hermite_R0 = vcombine_f32(vcreate_f32(0x3F80000040000000ULL),
                                 vcreate_f32(0x3F800000C0000000ULL));  // {2, 1, -2, 1}
  f128 hermite_R1 = vcombine_f32(vcreate_f32(0xC0000000C0400000ULL),
                                 vcreate_f32(0xBF80000040400000ULL));  // {-3, -2, 3, -1}
#else
  f128 hermite_R0 = F128::LoadA16(hermite_R0_);
  f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif
  ttt = F128::Mult(ttt, hermite_R0);
  ttt = F128::MultAdd(tt, hermite_R1, ttt);
  ttt = F128::MultAdd(t, F128::Set0100(), ttt);
  ttt = F128::Add(ttt, F128::Set1000());
  // ... (the weights in ttt are then combined with p0, v0, p1, v1; elided)
}
NLIB_M2(f128) F128::CatmullRom(f128arg p0, f128arg p1, f128arg p2, f128arg_ex p3,
                               f128arg_ex t) NLIB_NOEXCEPT {
  f128 tt = F128::Mult(t, t);
  f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  f128 catmull_R0 = vcombine_f32(vcreate_f32(0x40400000BF800000ULL),
                                 vcreate_f32(0x3F800000C0400000ULL));  // {-1, 3, -3, 1}
  f128 catmull_R1 = vcombine_f32(vcreate_f32(0xC0A0000040000000ULL),
                                 vcreate_f32(0xBF80000040800000ULL));  // {2, -5, 4, -1}
  f128 catmull_R2 = vcombine_f32(vcreate_f32(0x00000000BF800000ULL),
                                 vcreate_f32(0x000000003F800000ULL));  // {-1, 0, 1, 0}
#else
  f128 catmull_R0 = F128::LoadA16(catmull_R0_);
  f128 catmull_R1 = F128::LoadA16(catmull_R1_);
  f128 catmull_R2 = F128::LoadA16(catmull_R2_);
#endif
  ttt = F128::Mult(ttt, catmull_R0);
  ttt = F128::MultAdd(tt, catmull_R1, ttt);
  ttt = F128::MultAdd(t, catmull_R2, ttt);
  ttt = F128::Add(ttt, F128::Set0100());
  // ... (the weights in ttt are then combined with p0, p1, p2, p3; elided)
}
NLIB_M(f128) F128::BaryCentric(f128arg p0, f128arg p1, f128arg p2, f128arg_ex f,
                               f128arg_ex g) NLIB_NOEXCEPT {
  // p0 + f*(p1 - p0) + g*(p2 - p0)
  f128 p1p0 = F128::Sub(p1, p0);
  f128 p2p0 = F128::Sub(p2, p0);
  f128 tmp = F128::MultAdd(f, p1p0, p0);
  return F128::MultAdd(g, p2p0, tmp);
}
NLIB_M(f128) F128::Or(f128arg a, f128arg b) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::Xor(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
  ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
  ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
  ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::Not(f128arg a) NLIB_NOEXCEPT {  // header reconstructed from call sites
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  // CmpEq(a, a) is all-ones (for non-NaN lanes), so andnot inverts a
  return _mm_andnot_ps(a, F128::CmpEq(a, a));
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}
// AndNot(a, b) computes ~a & b.
NLIB_M(f128) F128::AndNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}
// OrNot(a, b) computes ~a | b.
NLIB_M(f128) F128::OrNot(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
  ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
  ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
  ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::CmpEq(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
  ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
  uint32x4_t tmp = vceqq_f32(a, b);
  return vreinterpretq_f32_u32(tmp);
#endif
}
NLIB_M(f128) F128::CmpNearEq(f128arg a, f128arg b, f128arg eps) NLIB_NOEXCEPT {
  f128 tmp = F128::AbsDiff(a, b);
  return F128::CmpLe(tmp, eps);
}

NLIB_M(f128) F128::Clamp(f128arg value, f128arg min, f128arg max) NLIB_NOEXCEPT {
  return F128::Min(max, F128::Max(min, value));
}
NLIB_M(f128) F128::InBound(f128arg value, f128arg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  uint32x4_t tmp = vcaleq_f32(value, bounds);  // |value| <= |bounds|
  return vreinterpretq_f32_u32(tmp);
#else
  return F128::CmpLe(F128::Abs(value), bounds);
#endif
}

// Zero-compare family (function headers reconstructed from the declarations):
NLIB_M(f128) F128::CmpEqZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
  return F128::CmpEq(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
  return F128::CmpLt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpLeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vclezq_f32(value));
#else
  return F128::CmpLe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGtZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
  return F128::CmpGt(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpGeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
  return F128::CmpGe(value, F128::SetZero());
#endif
}

NLIB_M(f128) F128::CmpNeZero(f128arg value) NLIB_NOEXCEPT {
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
  return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
  return F128::CmpNe(value, F128::SetZero());
#endif
}
NLIB_M(f128) F128::CmpNearEqZero(f128arg value, f128arg eps) NLIB_NOEXCEPT {
  f128 tmp = F128::Abs(value);
  return F128::CmpLe(tmp, eps);
}
NLIB_M(f128) F128::Recp(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  return vdivq_f32(vdupq_n_f32(1.f), value);
#else
  // two Newton-Raphson steps on the reciprocal estimate, with 1/0 -> +inf
  float32x4_t x;
  x = vrecpeq_f32(value);
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  x = vmulq_f32(x, vrecpsq_f32(x, value));
  uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
  float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
  return result;
#endif
#else
  return F128::Div(F128::SetOne(), value);
#endif
}
NLIB_M(f128) F128::RecpEst(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  // the scalar fallback returns the exact reciprocal
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
  return vrecpeq_f32(value);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
  return ret;
#endif
}
NLIB_M(f128) F128::Sqrt(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  // sqrt(x) = x * 1/sqrt(x); mask zero lanes so sqrt(0) = 0 rather than NaN
  f128 iszero = F128::CmpEqZero(value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::AndNot(iszero, result);
#else
  f128 zero = F128::SetZero();
  f128 iszero = F128::CmpEq(zero, value);
  f128 result = F128::Mult(value, F128::RecpSqrt(value));
  return F128::Select(iszero, zero, result);
#endif
}
NLIB_M(f128) F128::SqrtEst(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = sqrtf(value.vec.v[0]);
  ret.vec.v[1] = sqrtf(value.vec.v[1]);
  ret.vec.v[2] = sqrtf(value.vec.v[2]);
  ret.vec.v[3] = sqrtf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
  // sqrt(x) ~ 1 / (1/sqrt(x)), both as hardware estimates
  return vrecpeq_f32(vrsqrteq_f32(value));
#else
  f128 ret;
  ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
  return ret;
#endif
}
NLIB_M(f128) F128::RecpSqrt(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
  // two Newton-Raphson steps: x' = x * (3 - v*x^2) / 2, where vrsqrtsq
  // computes (3 - v*x^2) / 2
  float32x4_t x;
  x = vrsqrteq_f32(value);
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
  f128 zeromask = F128::CmpEqZero(value);
  return F128::Select(zeromask, F128::SetInfinity(), x);
#else
  // the same Newton iteration written out with paired singles
  f128 ret;
  f32x2 three = __PS_FDUP(3.f);
  f32x2 half = __PS_FDUP(0.5f);
  f32x2 v, x, xx;
  v = value.vec.ps[0];
  x = __PS_RSQRTE(v);  // initial estimate reconstructed
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[0] = __PS_MUL(half, xx);
  v = value.vec.ps[1];
  x = __PS_RSQRTE(v);  // initial estimate reconstructed
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  x = __PS_MUL(half, xx);
  xx = __PS_MUL(x, x);
  xx = __PS_NMSUB(v, xx, three);
  xx = __PS_MUL(x, xx);
  ret.vec.ps[1] = __PS_MUL(half, xx);
  f128 iszero = F128::CmpEq(F128::SetZero(), value);
  f128 inf = F128::SetInfinity();
  return F128::Select(iszero, inf, ret);
#endif
}
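// Added note: RecpSqrt deliberately maps 0 to +infinity (matching 1/sqrt(0))
// via the zero mask, while RecpSqrtEst below returns the raw hardware
// estimate (roughly 12 bits on SSE, 8 bits on NEON) without refinement.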
NLIB_M(f128) F128::RecpSqrtEst(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
  ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
  ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
  ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
  return vrsqrteq_f32(value);
#else
  f128 ret;
  ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
  ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
  return ret;
#endif
}
template <bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
NLIB_M(f128) F128::NegateEx(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
  // Permute picks lane i from value (index i) or from Negate(value) (index i + 4).
  const size_t lane0 = NegateLane0 ? 4 : 0;
  const size_t lane1 = NegateLane1 ? 5 : 1;
  const size_t lane2 = NegateLane2 ? 6 : 2;
  const size_t lane3 = NegateLane3 ? 7 : 3;
  return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));
}

template <>
NLIB_M(f128) F128::NegateEx<false, false, false, false>(f128arg value) NLIB_NOEXCEPT {
  return value;  // body reconstructed: no lane negated
}

template <>
NLIB_M(f128) F128::NegateEx<true, true, true, true>(f128arg value) NLIB_NOEXCEPT {
  return F128::Negate(value);
}
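// Added note: NegateEx<false, true, false, true>({1,2,3,4}) yields
// {1,-2,3,-4}; the all-false and all-true specializations above collapse to a
// plain copy and a full Negate, avoiding the Permute.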
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#define NLIB_ISNAN(vec, idx) \
  ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif

NLIB_M(f128) F128::IsNaN(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
  return ret;
#elif defined(CAFE)
  // paired singles lack a NaN compare: only a NaN lane fails the >= 0 test
  // for both v and -v, so only NaN lanes end up at -1
  f128 ret;
  f32x2 one = __PS_FDUP(1.f);
  f32x2 minus_one = __PS_NEG(one);
  f32x2 v0 = value.vec.ps[0];
  f32x2 v1 = value.vec.ps[1];
  f32x2 t0 = __PS_SEL(v0, one, minus_one);
  f32x2 t1 = __PS_SEL(v1, one, minus_one);
  f32x2 v0neg = __PS_NEG(v0);
  f32x2 v1neg = __PS_NEG(v1);
  ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
  ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);
  // ... (the +/-1 lanes are then converted to an all-ones/zero mask; elided)
#else
  // NaN is the only value that compares unequal to itself
  return F128::CmpNe(value, value);
#endif
}
NLIB_M(f128) F128::IsInfinite(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE)
  f128 ret;
  ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
  ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
  ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
  ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
  return ret;
#elif defined(CAFE)
  // FLT_MAX - |x| goes negative only for infinite lanes
  f128 ret;
  f32x2 big_value = __PS_FDUP(FLT_MAX);
  ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
  ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
  // ... (the sign lanes are then converted to an all-ones/zero mask; elided)
#else
  f128 inf_value = F128::SetInfinity();
  f128 abs_value = F128::Abs(value);
  return F128::CmpEq(inf_value, abs_value);
#endif
}
NLIB_M(f128) F128::Round(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
  return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
  return vrndaq_f32(value);
#else
  // adding then subtracting 2^23 (0x4B000000), carrying the input's sign,
  // makes the FPU round away the fractional bits
  f128 sgn = F128::And(value, F128::SetSignMask());
  f128 sm = F128::Or(F128::SetValue(0x4B000000U, each_uint32), sgn);
  f128 result = F128::Sub(F128::Add(value, sm), sm);
  // ... (lanes with |value| >= 2^23 are already integral and are returned
  // unchanged; selection elided)
#endif
}
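// Worked example (added): for value = 2.6f the magic constant sm is 2^23
// (0x4B000000). Floats in [2^23, 2^24) can only represent integers, so
// 2.6 + 8388608 rounds to 8388611.0, and subtracting 8388608 leaves 3.0f.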
NLIB_M(f128) F128::Truncate(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  for (size_t i = 0; i < 4; ++i) {
    if (NLIB_ISNAN(value.vec, i)) {
      ret.vec.u[i] = 0x7FC00000U;  // canonical quiet NaN
    } else {
      // |x| >= 2^23 is already integral (and would overflow the int cast)
      ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                         ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                         : value.vec.v[i];
    }
  }
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  return F128::Select(cond, casted, value);
#else
  return vrndq_f32(value);
#endif
#endif
}
NLIB_M(f128) F128::Floor(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = floorf(value.vec.v[0]);
  ret.vec.v[1] = floorf(value.vec.v[1]);
  ret.vec.v[2] = floorf(value.vec.v[2]);
  ret.vec.v[3] = floorf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  // truncation rounds toward zero; step down where it overshot (negative inputs):
  // the all-ones compare mask converts to -1, so adding it subtracts 1
  f128 large_mask = F128::CmpGt(casted, value);
  casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(large_mask)));
  return F128::Select(cond, casted, value);
#else
  return vrndmq_f32(value);
#endif
#endif
}
NLIB_M(f128) F128::Ceil(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = ceilf(value.vec.v[0]);
  ret.vec.v[1] = ceilf(value.vec.v[1]);
  ret.vec.v[2] = ceilf(value.vec.v[2]);
  ret.vec.v[3] = ceilf(value.vec.v[3]);
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
  f128 x = F128::Abs(value);
  f128 c_2_23 = F128::SetValue(8388608.f, each_float);
  f128 cond = F128::CmpLt(x, c_2_23);
  f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
  // step up where truncation undershot (positive inputs): subtracting the
  // converted all-ones mask (-1) adds 1
  f128 small_mask = F128::CmpLt(casted, value);
  casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(small_mask)));
  return F128::Select(cond, casted, value);
#else
  return vrndpq_f32(value);
#endif
#endif
}
NLIB_M(f128) F128::Saturate(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
#ifdef NLIB_F128_SIMD_NOUSE
  // ... (scalar clamp of each lane to [0, 1]; elided)
#else
  return F128::Clamp(value, F128::SetZero(), F128::SetOne());
#endif
}
NLIB_M2(f128) F128::ModAngle(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
  // value - 2*pi * Round(value / (2*pi)), wrapping into [-pi, pi]
  static const float v_1_2pi = 0.15915494309189535f;  // 1 / (2*pi)
  static const float v_2pi = 6.283185307179586f;      // 2*pi
  const f128 recp_two_pi = F128::SetValue(v_1_2pi, each_float);
  f128 round = F128::Round(F128::Mult(value, recp_two_pi));
  const f128 two_pi = F128::SetValue(v_2pi, each_float);
  return F128::MultSub(two_pi, round, value);  // value - two_pi * round
}
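// Worked example (added): ModAngle(7.0f): 7 * (1/(2*pi)) ~= 1.114,
// Round -> 1, and 7 - 2*pi ~= 0.7168 rad, the same angle wrapped into
// [-pi, pi].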
NLIB_M2(f128) F128::Sin(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
  f128 x = F128::ModAngle(value);
  // ...
  f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
  f128 pi = F128::SetValue<0>(sin_cvalue, each_select32);      // lane layout assumed
  f128 pidiv2 = F128::SetValue<1>(sin_cvalue, each_select32);  // lane layout assumed
  // fold lanes outside [-pi/2, pi/2] using sin(pi - x) = sin(x)
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, pi);
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, pidiv2);
  x = F128::Select(cond, x, pi_x);
  // odd minimax polynomial evaluated in x^2
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(sin_coeff_);
  f128 result;
  // ... (leading coefficient steps elided)
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
  result = F128::Mult(xx, result);
  result = F128::MultSub(result, x, x);  // x - x * result
  return result;
}
NLIB_M2(f128) F128::Cos(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
  f128 x = F128::ModAngle(value);
  // ...
  f128 cvalue = F128::LoadA16(cos_cvalue_);
  // fold lanes outside [-pi/2, pi/2] using cos(pi - x) = -cos(x)
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(F128::SetSignMask(), x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  // folded lanes need their sign flipped afterwards
  f128 sign = F128::AndNot(cond, F128::SetSignMask());
  // even minimax polynomial evaluated in x^2
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(cos_coeff_);
  f128 result;
  // ... (leading coefficient steps elided)
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetOne());  // 1 - xx * result
  result = F128::Xor(sign, result);
  return result;
}
NLIB_M2(f128x2) F128::SinCos(f128arg value) NLIB_NOEXCEPT {  // header reconstructed
  f128x2 ret;
  const f128 signmask = F128::SetSignMask();
  f128 x = F128::ModAngle(value);
  // ...
  f128 cvalue = F128::LoadA16(cos_cvalue_);
  f128 xabs = F128::Abs(value);
  f128 xsign = F128::And(signmask, x);
  f128 mypi = F128::Or(xsign, F128::SetValue<0>(cvalue, each_select32));
  f128 pi_x = F128::Sub(mypi, x);
  f128 cond = F128::CmpLe(xabs, F128::SetValue<1>(cvalue, each_select32));
  x = F128::Select(cond, x, pi_x);
  // cosine: even polynomial, sign-corrected on folded lanes
  f128 sign = F128::AndNot(cond, signmask);
  f128 xx = F128::Mult(x, x);
  f128 coeff = F128::LoadA16(cos_coeff_);
  f128 result;
  // ... (leading coefficient steps elided)
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
  result = F128::MultSub(xx, result, F128::SetOne());
  ret.val[1] = F128::Xor(sign, result);
  // sine: odd polynomial on the same folded argument
  coeff = F128::LoadA16(sin_coeff_);
  // ... (leading coefficient steps elided)
  result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
  result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
  result = F128::Mult(xx, result);
  ret.val[0] = F128::MultSub(result, x, x);
  return ret;
}
2438 f128 cmp, value_sign;
2440 f128 one = F128::SetOne();
2445 value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
2446 cmp = F128::CmpLe(F128::Abs(value), one);
2448 f128 x = F128::Select(cmp, value, F128::Recp(value));
2455 f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
2456 f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
2457 f128 xx = F128::Mult(x, x);
2460 result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
2461 result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
2462 result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
2463 result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
2464 result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
2465 result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
2467 result = F128::Mult(result, x);
2468 result = F128::MultSub(xx, result, x);
2470 f128 pi_2 = F128::SetValue(1.5707963267948966f, each_float);
2471 f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
2472 result = F128::Select(cmp, result, result_another);
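A hedged scalar reading of the range reduction above: for |v| > 1 the polynomial runs on 1/v and the result is folded back with atan(v) = copysign(pi/2, v) - atan(1/v).

#include <cmath>
float ArcTanModel(float v) {
    const float pi_2 = 1.5707963267948966f;
    if (std::fabs(v) <= 1.f) return std::atan(v);         // cmp lanes: direct
    return std::copysign(pi_2, v) - std::atan(1.f / v);   // Recp(value) lanes
}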
2476 NLIB_M2(f128) F128::ArcTan2(f128arg y, f128arg x) NLIB_NOEXCEPT {
2493 const f128 signmask = F128::SetSignMask();
2495 const f128 sy = F128::And(y, signmask);
2496 const f128 infx = F128::IsInfinite(x);
2497 const f128 infy = F128::IsInfinite(y);
2498 const f128 zerox = F128::CmpEqZero(x);
2499 const f128 zeroy = F128::CmpEqZero(y);
2500 const f128 posx = F128::CmpGtZero(x);
2510 const f128 cval = F128::LoadA16(atan2_cvalue_);
2511 const f128 pi = F128::Or(sy, F128::SetValue<0>(cval, each_select32));
2512 const f128 pi_34 = F128::Or(sy, F128::SetValue<1>(cval, each_select32));
2513 const f128 pi_2 = F128::Or(sy, F128::SetValue<2>(cval, each_select32));
2514 const f128 pi_4 = F128::Or(sy, F128::SetValue<3>(cval, each_select32));
2516 f128 v = F128::Select(
2517 infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
2518 F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));
2523 #if defined(NLIB_F128_SIMD_NOUSE)
2525 mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
2526 mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
2527 mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
2528 mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
2532 mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
2533 mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
2534 mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
2535 mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
2537 f128 mask = F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v),
2540 f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
2541 return F128::Select(mask, result, v);
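The Select lattice above resolves the special cases branch-free, writing an all-ones sentinel into lanes that still need the generic atan(y/x) path. A hedged scalar model of the per-lane meaning (up to signed-zero details):

#include <cmath>
float ArcTan2Model(float y, float x) {
    const float pi = 3.14159265358979323846f;
    const float sy = std::signbit(y) ? -1.f : 1.f;            // sign taken from y
    if (std::isinf(y)) {                                      // infy lanes
        if (std::isinf(x)) return sy * (x > 0.f ? pi / 4 : 3 * pi / 4);
        return sy * (pi / 2);
    }
    if (y == 0.f) return (x > 0.f) ? 0.f : sy * pi;           // zeroy lanes
    if (x == 0.f) return sy * (pi / 2);                       // zerox lanes
    float r = std::atan(y / x);                               // ArcTan(Div(y, x))
    if (!(x > 0.f)) r += sy * pi;                             // AndNot(posx, pi)
    return r;
}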
2546 f128 one = F128::SetOne();
2547 f128 tmp = F128::MultSub(value, value, one);
2548 f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2549 return F128::ArcTan2(value, argx);
2554 f128 one = F128::SetOne();
2555 f128 tmp = F128::MultSub(value, value, one);
2556 f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
2557 return F128::ArcTan2(argx, value);
2562 #ifdef NLIB_F128_SIMD_NOUSE
2564 ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
2565 ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
2566 ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
2567 ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
2569 #elif defined(NLIB_SSE41)
2570 return static_cast<uint8_t>(_mm_movemask_ps(value));
2571 #elif defined(NLIB_NEON)
2572 uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);
2573 uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);
2574 uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
2575 uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
2577 return vaddvq_u32(a);
2579 uint16x4_t tmp = vmovn_u32(a);
2580 tmp = vpadd_u16(tmp, tmp);
2581 tmp = vpadd_u16(tmp, tmp);
2582 return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
2585 int tmp = (value.vec.u[0] >> 31);
2586 tmp |= (value.vec.u[1] >> 30) & 2;
2587 tmp |= (value.vec.u[2] >> 29) & 4;
2588 tmp |= (value.vec.u[3] >> 28) & 8;
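All the paths above compute the same packing; a scalar equivalent, taking each lane's top bit to bit i of the result as _mm_movemask_ps does:

#include <cstdint>
uint8_t MoveMaskModel(const uint32_t lanes[4]) {
    uint8_t ret = 0;
    for (int i = 0; i < 4; ++i)
        ret |= static_cast<uint8_t>((lanes[i] >> 31) << i);  // sign bit -> bit i
    return ret;
}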
2594 NLIB_M2(bool) F128::IsAllMaskFalse(f128arg value) NLIB_NOEXCEPT {
2595 #ifdef NLIB_F128_SIMD_NOUSE
2596 return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
2597 #elif defined(NLIB_SSE41)
2598 i128 casted = F128::CastToI128(value);
2599 return _mm_testz_si128(casted, casted) != 0;
2600 #elif defined(NLIB_NEON)
2602 uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
2603 return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2605 int32x4_t casted = vreinterpretq_s32_f32(value);
2606 int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
2607 return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
2610 uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
2611 return (tmp & 0x80000000U) == 0;
2616 NLIB_M2(bool) F128::IsAllMaskTrue(f128arg value) NLIB_NOEXCEPT {
2617 #ifdef NLIB_F128_SIMD_NOUSE
2618 return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
2619 value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
2620 #elif defined(NLIB_SSE41)
2621 i128 casted = F128::CastToI128(value);
2622 return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
2623 #elif defined(NLIB_NEON)
2625 uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
2626 return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
2628 int32x4_t casted = vreinterpretq_s32_f32(value);
2629 int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
2630 return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
2633 uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
2634 return (tmp & 0x80000000U) != 0;
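A compact restatement of the SSE4.1 idioms used above, for reference:

#include <smmintrin.h>
// _mm_testz_si128(m, m) != 0  <=> every bit of m is zero (all-false mask).
// _mm_testc_si128(m, ones) != 0  <=> m contains every bit of ones (all-true mask).
bool AllFalse(__m128i m) { return _mm_testz_si128(m, m) != 0; }
bool AllTrue(__m128i m) {
    __m128i ones = _mm_cmpeq_epi8(m, m);  // all bits set
    return _mm_testc_si128(m, ones) != 0;
}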
2640 NLIB_M(float) F128::GetFloatFromLane(f128arg value) NLIB_NOEXCEPT {
2642 #ifdef NLIB_F128_SIMD_NOUSE
2643 return value.vec.v[N];
2644 #elif defined(NLIB_SSE41)
2646 _MM_EXTRACT_FLOAT(dest, value, N);
2648 #elif defined(NLIB_NEON)
2649 return vgetq_lane_f32(value, N);
2651 return value.vec.ps[N / 2][N % 2];
2657 NLIB_M(uint32_t) F128::GetUint32FromLane(f128arg value) NLIB_NOEXCEPT {
2659 #ifdef NLIB_F128_SIMD_NOUSE
2660 return value.vec.u[N];
2661 #elif defined(NLIB_SSE41)
2662 return _mm_extract_ps(value, N);
2663 #elif defined(NLIB_NEON)
2664 uint32x4_t tmp = vreinterpretq_u32_f32(value);
2665 return vgetq_lane_u32(tmp, N);
2667 return value.vec.u[N];
2672 NLIB_M2(float) F128::GetFloatByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2673 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2674 return value.vec.v[idx];
2675 #elif defined(NLIB_SSE41)
2679 _MM_EXTRACT_FLOAT(dest, value, 0);
2682 _MM_EXTRACT_FLOAT(dest, value, 1);
2685 _MM_EXTRACT_FLOAT(dest, value, 2);
2688 _MM_EXTRACT_FLOAT(dest, value, 3);
2695 #elif defined(NLIB_NEON)
2698 return vgetq_lane_f32(value, 0);
2700 return vgetq_lane_f32(value, 1);
2702 return vgetq_lane_f32(value, 2);
2704 return vgetq_lane_f32(value, 3);
2713 NLIB_M2(uint32_t) F128::GetUint32ByIndex(f128arg value, size_t idx) NLIB_NOEXCEPT {
2714 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
2715 return value.vec.u[idx];
2716 #elif defined(NLIB_SSE41)
2719 return static_cast<uint32_t>(_mm_extract_ps(value, 0));
2721 return static_cast<uint32_t>(_mm_extract_ps(value, 1));
2723 return static_cast<uint32_t>(_mm_extract_ps(value, 2));
2725 return static_cast<uint32_t>(_mm_extract_ps(value, 3));
2730 #elif defined(NLIB_NEON)
2731 uint32x4_t tmp = vreinterpretq_u32_f32(value);
2734 return vgetq_lane_u32(tmp, 0);
2736 return vgetq_lane_u32(tmp, 1);
2738 return vgetq_lane_u32(tmp, 2);
2740 return vgetq_lane_u32(tmp, 3);
2750 NLIB_M(f128) F128::SetFloatToLane(f128arg value, float v) NLIB_NOEXCEPT {
2752 #ifdef NLIB_F128_SIMD_NOUSE
2756 #elif defined(NLIB_SSE41)
2757 f128 tmp = _mm_set_ss(v);
2758 return _mm_insert_ps(value, tmp, N << 4);
2759 #elif defined(NLIB_NEON)
2760 return __builtin_constant_p(v) ?
2761 F128::Permute<N == 0 ? 4 : 0,
2764 N == 3 ? 7 : 3>(value, vdupq_n_f32(v)) :
2765 vsetq_lane_f32(v, value, N);
2768 ret.vec.ps[N / 2][N % 2] = v;
2774 NLIB_M2(f128) F128::SetFloatByIndex(f128arg value, float v, size_t i) NLIB_NOEXCEPT {
2775 #ifdef NLIB_F128_SIMD_NOUSE
2779 #elif defined(NLIB_SSE41)
2780 f128 tmp = _mm_set_ss(v);
2783 return _mm_insert_ps(value, tmp, 0x00);
2785 return _mm_insert_ps(value, tmp, 0x10);
2787 return _mm_insert_ps(value, tmp, 0x20);
2789 return _mm_insert_ps(value, tmp, 0x30);
2794 #elif defined(NLIB_NEON)
2797 return F128::SetFloatToLane<0>(value, v);
2799 return F128::SetFloatToLane<1>(value, v);
2801 return F128::SetFloatToLane<2>(value, v);
2803 return F128::SetFloatToLane<3>(value, v);
2812 ret.vec.ps[0][0] = v;
2815 ret.vec.ps[0][1] = v;
2818 ret.vec.ps[1][0] = v;
2821 ret.vec.ps[1][1] = v;
2828 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
2831 template <bool IsHighA, bool IsHighB>
2832 float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;
2835 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2837 return vtrn1_f32(a, b);
2839 return vtrn_f32(a, b).val[0];
2844 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, false>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2846 return vtrn1_f32(vrev64_f32(a), b);
2848 return vtrn_f32(vrev64_f32(a), b).val[0];
2853 NLIB_ALWAYS_INLINE float32x2_t F64Merge<false, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2855 return vtrn1_f32(a, vrev64_f32(b));
2857 return vtrn_f32(a, vrev64_f32(b)).val[0];
2862 NLIB_ALWAYS_INLINE float32x2_t F64Merge<true, true>(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT {
2864 return vtrn2_f32(a, b);
2866 return vtrn_f32(a, b).val[1];
2875 return vget_low_f32(value);
2880 return vget_high_f32(value);
2883 template <int X0, int X1>
2884 struct F128SwizzleHelper2 {
2886 float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
2887 float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
2888 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
2893 struct F128SwizzleHelper2<X, X> {
2895 float32x2_t x = F128SwizzleGet64<X / 2>(value);
2896 return vdup_lane_f32(x, (X & 1));
2901 struct F128SwizzleHelper2<0, 1> {
2903 return vget_low_f32(value);
2908 struct F128SwizzleHelper2<0, 2> {
2911 return vget_low_f32(vuzp1q_f32(value, value));
2913 float32x2_t lo = vget_low_f32(value);
2914 float32x2_t hi = vget_high_f32(value);
2915 return vzip_f32(lo, hi).val[0];
2921 struct F128SwizzleHelper2<0, 3> {
2923 float32x2_t lo = vget_low_f32(value);
2924 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2926 return vzip1_f32(lo, hi);
2928 return vzip_f32(lo, hi).val[0];
2934 struct F128SwizzleHelper2<1, 0> {
2936 return vrev64_f32(vget_low_f32(value));
2941 struct F128SwizzleHelper2<1, 2> {
2943 float32x2_t lo = vget_low_f32(value);
2944 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2946 return vzip2_f32(lo, hi);
2948 return vzip_f32(lo, hi).val[1];
2954 struct F128SwizzleHelper2<1, 3> {
2957 return vget_low_f32(vuzp2q_f32(value, value));
2959 float32x2_t lo = vget_low_f32(value);
2960 float32x2_t hi = vget_high_f32(value);
2961 return vzip_f32(lo, hi).val[1];
2967 struct F128SwizzleHelper2<2, 0> {
2970 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
2972 float32x2_t lo = vget_low_f32(value);
2973 float32x2_t hi = vget_high_f32(value);
2974 return vzip_f32(hi, lo).val[0];
2980 struct F128SwizzleHelper2<2, 1> {
2983 return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
2985 float32x2_t lo = vget_low_f32(value);
2986 float32x2_t hi = vrev64_f32(vget_high_f32(value));
2987 return vzip_f32(hi, lo).val[1];
2993 struct F128SwizzleHelper2<2, 3> {
2995 return vget_high_f32(value);
3000 struct F128SwizzleHelper2<3, 0> {
3002 float32x2_t lo = vget_low_f32(value);
3003 float32x2_t hi = vrev64_f32(vget_high_f32(value));
3005 return vzip1_f32(hi, lo);
3007 return vzip_f32(hi, lo).val[0];
3013 struct F128SwizzleHelper2<3, 1> {
3015 float32x2_t lo = vget_low_f32(value);
3016 float32x2_t hi = vget_high_f32(value);
3018 return vzip2_f32(hi, lo);
3020 return vzip_f32(hi, lo).val[1];
3026 struct F128SwizzleHelper2<3, 2> {
3028 return vrev64_f32(vget_high_f32(value));
3032 template <int V0, int V1, int V2, int V3>
3033 struct F128SwizzleHelper {
3035 return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
3036 detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
3040 template <int Vx, int Vy>
3041 struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
3043 float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
3044 return vcombine_f32(tmp, tmp);
3049 struct F128SwizzleHelper<V, V, V, V> {
3056 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3059 template <int X0, int X1>
3060 struct F128SwizzleHelper {
3065 struct F128SwizzleHelper<0, 0> {
3068 return __PS_MERGE00(v0, v0);
3073 struct F128SwizzleHelper<0, 1> {
3081 struct F128SwizzleHelper<0, 2> {
3083 return __PS_MERGE00(v0, v1);
3088 struct F128SwizzleHelper<0, 3> {
3090 return __PS_MERGE01(v0, v1);
3095 struct F128SwizzleHelper<1, 0> {
3098 return __PS_MERGE10(v0, v0);
3103 struct F128SwizzleHelper<1, 1> {
3106 return __PS_MERGE11(v0, v0);
3111 struct F128SwizzleHelper<1, 2> {
3113 return __PS_MERGE10(v0, v1);
3118 struct F128SwizzleHelper<1, 3> {
3120 return __PS_MERGE11(v0, v1);
3125 struct F128SwizzleHelper<2, 0> {
3127 return __PS_MERGE00(v1, v0);
3132 struct F128SwizzleHelper<2, 1> {
3134 return __PS_MERGE01(v1, v0);
3139 struct F128SwizzleHelper<2, 2> {
3142 return __PS_MERGE00(v1, v1);
3147 struct F128SwizzleHelper<2, 3> {
3155 struct F128SwizzleHelper<3, 0> {
3157 return __PS_MERGE10(v1, v0);
3162 struct F128SwizzleHelper<3, 1> {
3164 return __PS_MERGE11(v1, v0);
3169 struct F128SwizzleHelper<3, 2> {
3172 return __PS_MERGE10(v1, v1);
3177 struct F128SwizzleHelper<3, 3> {
3180 return __PS_MERGE11(v1, v1);
3187 template <int V0, int V1, int V2, int V3>
3194 #if defined(NLIB_F128_SIMD_NOUSE)
3196 ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
3197 ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
3198 ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
3199 ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
3201 #elif __has_builtin(__builtin_shufflevector)
3202 return __builtin_shufflevector(value, value, V0, V1, V2, V3);
3203 #elif defined(NLIB_SSE41)
3204 return _mm_shuffle_ps(value, value,
3205 _MM_SHUFFLE(V3 != -1 ? V3 : 3,
3208 V0 != -1 ? V0 : 0));
3209 #elif defined(NLIB_NEON)
3210 return detail::F128SwizzleHelper<
3214 V3 != -1 ? V3 : 3>::Swizzle(value);
3217 ret.vec.ps[0] = detail::F128SwizzleHelper<
3218 (V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3219 ret.vec.ps[1] = detail::F128SwizzleHelper<
3220 (V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(value.vec.ps[0], value.vec.ps[1]);
3225 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3228 NLIB_M(f128) F128::Swizzle<0, 0, 1, 1>(f128arg value) NLIB_NOEXCEPT {
3230 return vzip1q_f32(value, value);
3232 return vzipq_f32(value, value).val[0];
3236 NLIB_M(f128) F128::Swizzle<0, 0, 2, 2>(f128arg value) NLIB_NOEXCEPT {
3238 return vtrn1q_f32(value, value);
3240 return vtrnq_f32(value, value).val[0];
3244 NLIB_M(f128) F128::Swizzle<0, 1, 2, 3>(f128arg value) NLIB_NOEXCEPT {
3248 NLIB_M(f128) F128::Swizzle<0, 2, 0, 2>(f128arg value) NLIB_NOEXCEPT {
3250 return vuzp1q_f32(value, value);
3252 return vuzpq_f32(value, value).val[0];
3256 NLIB_M(f128) F128::Swizzle<1, 0, 3, 2>(f128arg value) NLIB_NOEXCEPT {
3257 return vrev64q_f32(value);
3260 NLIB_M(f128) F128::Swizzle<1, 1, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3262 return vtrn2q_f32(value, value);
3264 return vtrnq_f32(value, value).val[1];
3268 NLIB_M(f128) F128::Swizzle<1, 2, 3, 0>(f128arg value) NLIB_NOEXCEPT {
3269 uint32x4_t ival = vreinterpretq_u32_f32(value);
3270 uint32x4_t rotated = vextq_u32(ival, ival, 1);
3271 return vreinterpretq_f32_u32(rotated);
3274 NLIB_M(f128) F128::Swizzle<1, 3, 1, 3>(f128arg value) NLIB_NOEXCEPT {
3276 return vuzp2q_f32(value, value);
3278 return vuzpq_f32(value, value).val[1];
3282 NLIB_M(f128) F128::Swizzle<2, 2, 3, 3>(f128arg value) NLIB_NOEXCEPT {
3284 return vzip2q_f32(value, value);
3286 return vzipq_f32(value, value).val[1];
3290 NLIB_M(f128) F128::Swizzle<2, 3, 0, 1>(f128arg value) NLIB_NOEXCEPT {
3291 uint32x4_t ival = vreinterpretq_u32_f32(value);
3292 uint32x4_t rotated = vextq_u32(ival, ival, 2);
3293 return vreinterpretq_f32_u32(rotated);
3296 NLIB_M(f128) F128::Swizzle<3, 0, 1, 2>(f128arg value) NLIB_NOEXCEPT {
3297 uint32x4_t ival = vreinterpretq_u32_f32(value);
3298 uint32x4_t rotated = vextq_u32(ival, ival, 3);
3299 return vreinterpretq_f32_u32(rotated);
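A usage sketch of the specializations above. Lane semantics are assumed from the generic path: result lane i takes source lane Vi, and SetValue(a, b, c, d) is assumed to fill lanes 0-3 in order.

f128 SwizzleDemo(f128arg v) {
    f128 rev = F128::Swizzle<1, 0, 3, 2>(v);    // swap within pairs: vrev64q_f32 on NEON
    f128 rot = F128::Swizzle<2, 3, 0, 1>(rev);  // rotate halves: a single vextq
    return F128::Swizzle<0, 0, 2, 2>(rot);      // duplicate even lanes: vtrn1q_f32
}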
3305 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3306 template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
3307 struct F128PermuteHelper2 {
3308 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3309 f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
3310 f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
3311 return _mm_blend_ps(as, bs, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3312 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3316 template <bool UseShuffle, int V0, int V1, int V2, int V3>
3317 struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
3318 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3319 return _mm_blend_ps(a, b, (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) |
3320 ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
3324 template <int V0, int V1, int V2, int V3>
3325 struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
3326 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3327 return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
3328 _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
3333 struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
3334 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3335 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
3336 return _mm_castsi128_ps(tmp);
3341 struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
3342 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3343 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
3344 return _mm_castsi128_ps(tmp);
3349 struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
3350 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3351 __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 20);
3352 return _mm_castsi128_ps(tmp);
3357 struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
3358 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3360 return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
3365 struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
3366 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3368 return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
3373 struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
3374 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3376 return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
3381 struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
3382 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3384 return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
3389 struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
3390 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3392 return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
3397 struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
3398 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3400 return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
3405 struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
3406 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3408 return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
3413 struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
3414 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3416 return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
3420 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3421 struct F128PermuteHelper {
3422 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3423 return F128PermuteHelper2<
3424 ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
3425 ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
3426 V0, V1, V2, V3>::Permute(a, b);
3430 #elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
3433 float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;
3436 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3438 return vget_low_f32(a);
3441 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3443 return vget_high_f32(a);
3446 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3448 return vget_low_f32(b);
3451 NLIB_ALWAYS_INLINE float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
3453 return vget_high_f32(b);
3456 template <int X0, int X1>
3457 struct F128PermuteHelper2 {
3459 float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
3460 float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
3461 return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
3466 struct F128PermuteHelper2<X, X> {
3468 float32x2_t x = F128PermuteGet64<X / 2>(a, b);
3469 return vdup_lane_f32(x, (X & 1));
3473 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3474 struct F128PermuteHelper {
3475 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3476 return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
3477 F128PermuteHelper2<V2, V3>::Permute(a, b));
3482 struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
3483 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3484 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
3485 return vreinterpretq_f32_s32(tmp);
3490 struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
3491 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3492 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
3493 return vreinterpretq_f32_s32(tmp);
3498 struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
3499 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3500 int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
3501 return vreinterpretq_f32_s32(tmp);
3504 #elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
3505 template<int R0, int R1, int VAR0, int VAR1>
3506 struct F128PermuteHelper2 {
3507 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
3510 template<int R0, int R1>
3511 struct F128PermuteHelper2<R0, R1, 0, 0> {
3512 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3513 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
3517 template<int R0, int R1>
3518 struct F128PermuteHelper2<R0, R1, 0, 1> {
3519 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3520 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
3524 template<int R0, int R1>
3525 struct F128PermuteHelper2<R0, R1, 0, 2> {
3526 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3527 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
3531 template<int R0, int R1>
3532 struct F128PermuteHelper2<R0, R1, 0, 3> {
3533 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3534 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
3538 template<int R0, int R1>
3539 struct F128PermuteHelper2<R0, R1, 1, 0> {
3540 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3541 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
3545 template<int R0, int R1>
3546 struct F128PermuteHelper2<R0, R1, 1, 1> {
3547 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3548 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
3552 template<int R0, int R1>
3553 struct F128PermuteHelper2<R0, R1, 1, 2> {
3554 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3555 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
3559 template<int R0, int R1>
3560 struct F128PermuteHelper2<R0, R1, 1, 3> {
3561 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3562 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
3566 template<int R0, int R1>
3567 struct F128PermuteHelper2<R0, R1, 2, 0> {
3568 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3569 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
3573 template<int R0, int R1>
3574 struct F128PermuteHelper2<R0, R1, 2, 1> {
3575 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3576 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
3580 template<int R0, int R1>
3581 struct F128PermuteHelper2<R0, R1, 2, 2> {
3582 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3583 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
3587 template<int R0, int R1>
3588 struct F128PermuteHelper2<R0, R1, 2, 3> {
3589 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3590 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
3594 template<int R0, int R1>
3595 struct F128PermuteHelper2<R0, R1, 3, 0> {
3596 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3597 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
3601 template<int R0, int R1>
3602 struct F128PermuteHelper2<R0, R1, 3, 1> {
3603 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3604 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
3608 template<int R0, int R1>
3609 struct F128PermuteHelper2<R0, R1, 3, 2> {
3610 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3611 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
3615 template<int R0, int R1>
3616 struct F128PermuteHelper2<R0, R1, 3, 3> {
3617 static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
3618 return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
3622 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3623 struct F128PermuteHelper {
3624 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3626 f32x2 x0 = a.vec.ps[0];
3627 f32x2 x1 = a.vec.ps[1];
3628 f32x2 x2 = b.vec.ps[0];
3629 f32x2 x3 = b.vec.ps[1];
3630 ret.vec.ps[0] = F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>
3631 ::Permute(x0, x1, x2, x3);
3632 ret.vec.ps[1] = F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>
3633 ::Permute(x0, x1, x2, x3);
3638 template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
3639 struct F128PermuteHelper {
3640 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3641 f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
3642 F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
3643 F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
3644 F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
3650 template <int V0, int V1, int V2, int V3>
3651 struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
3652 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3654 return F128::Swizzle<V0, V1, V2, V3>(a);
3658 template <int V0, int V1, int V2, int V3>
3659 struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
3660 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3662 return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
3666 #if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
3669 struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
3670 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3671 return _mm_unpacklo_ps(a, b);
3675 struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
3676 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3677 return _mm_unpacklo_ps(b, a);
3681 struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
3682 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3683 return _mm_unpackhi_ps(a, b);
3687 struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
3688 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3689 return _mm_unpackhi_ps(b, a);
3694 template<int V0, int V1, int V2, int V3>
3695 struct F128PermuteDontCareHelper {
3696 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3701 static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
3702 static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
3703 return detail::F128PermuteHelper< arg1, arg2,
3704 V0, V1, V2, V3 >::Permute(a, b);
3708 template<int V1, int V2, int V3>
3709 struct F128PermuteDontCareHelper<8, V1, V2, V3> {
3710 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3714 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3715 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3719 template<int V0, int V2, int V3>
3720 struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
3721 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3725 static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
3726 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3730 template<int V0, int V1, int V3>
3731 struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
3732 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3736 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3737 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3741 template<int V0, int V1, int V2>
3742 struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
3743 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3747 static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
3748 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3752 template<int V2, int V3>
3753 struct F128PermuteDontCareHelper<8, 8, V2, V3> {
3754 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3757 static const int V0 = (V2 < 4) ? 0 : 4;
3758 return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
3762 template<int V1, int V2>
3763 struct F128PermuteDontCareHelper<8, V1, V2, 8> {
3764 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3767 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3768 static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3769 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3773 template<int V0, int V1>
3774 struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
3775 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3778 static const int V2 = (V1 < 4) ? 2 : 6;
3779 return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
3783 template<int V0, int V3>
3784 struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
3785 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3788 static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3789 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3790 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3794 template<int V0, int V2>
3795 struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
3796 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3799 static const int V1 = (V0 & 1) ? V0 : V0 + 1;
3800 static const int V3 = (V2 & 1) ? V2 : V2 + 1;
3801 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3805 template<int V1, int V3>
3806 struct F128PermuteDontCareHelper<8, V1, 8, V3> {
3807 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3810 static const int V0 = (V1 & 1) ? V1 - 1 : V1;
3811 static const int V2 = (V3 & 1) ? V3 - 1 : V3;
3812 return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
3817 struct F128PermuteDontCareHelper<V, 8, 8, 8> {
3818 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3820 static const int V1 = ((V & 3) == 0) ? V + 1 : V;
3821 static const int V2 = ((V & 3) == 0) ? V + 2 : V;
3822 static const int V3 = ((V & 3) == 0) ? V + 3 : V;
3823 return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
3828 struct F128PermuteDontCareHelper<8, V, 8, 8> {
3829 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3831 static const int V0 = ((V & 3) == 1) ? V - 1 : V;
3832 static const int V2 = ((V & 3) == 1) ? V + 1 : V;
3833 static const int V3 = ((V & 3) == 1) ? V + 2 : V;
3834 return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
3839 struct F128PermuteDontCareHelper<8, 8, V, 8> {
3840 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3842 static const int V0 = ((V & 3) == 2) ? V - 2 : V;
3843 static const int V1 = ((V & 3) == 2) ? V - 1 : V;
3844 static const int V3 = ((V & 3) == 2) ? V + 2 : V;
3845 return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
3850 struct F128PermuteDontCareHelper<8, 8, 8, V> {
3851 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3853 static const int V0 = ((V & 3) == 3) ? V - 3 : V;
3854 static const int V1 = ((V & 3) == 3) ? V - 2 : V;
3855 static const int V2 = ((V & 3) == 3) ? V - 1 : V;
3856 return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
3861 struct F128PermuteDontCareHelper<8, 8, 8, 8> {
3862 static NLIB_M(f128) Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3870 template <int V0, int V1, int V2, int V3>
3872 NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
3873 #if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
3874 return __builtin_shufflevector(a, b,
3875 (V0 != 8 ? V0 : -1),
3876 (V1 != 8 ? V1 : -1),
3877 (V2 != 8 ? V2 : -1),
3878 (V3 != 8 ? V3 : -1));
3880 return detail::F128PermuteDontCareHelper <
3884 V3 != -1 ? V3 : 8>::Permute(a, b);
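A usage sketch with assumed lane semantics: indices 0-3 select lanes of a, 4-7 select lanes of b, and -1 marks a don't-care lane (mapped to 8 internally) that the helpers above may fill however is cheapest.

f128 PermuteDemo(f128arg a, f128arg b) {
    f128 lo = F128::Permute<0, 4, 1, 5>(a, b);  // {a0, b0, a1, b1}: _mm_unpacklo_ps
    return F128::Permute<1, 2, 3, 4>(lo, b);    // a single palignr/vext shift
}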
3888 template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
3891 NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
3892 #if defined(NLIB_NEON)
3893 const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
3894 const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
3895 const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
3896 const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
3899 const int v0 = SplatLane0 ? 4 : 0;
3900 const int v1 = SplatLane1 ? 5 : 1;
3901 const int v2 = SplatLane2 ? 6 : 2;
3902 const int v3 = SplatLane3 ? 7 : 3;
3904 return F128::Permute<v0, v1, v2, v3>(value, splat);
3908 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3910 ret.vec.v[0] = powf(2.f, value.vec.v[0]);
3911 ret.vec.v[1] = powf(2.f, value.vec.v[1]);
3912 ret.vec.v[2] = powf(2.f, value.vec.v[2]);
3913 ret.vec.v[3] = powf(2.f, value.vec.v[3]);
3916 i128 iround = F128::ConvertToI128Round(value);
3917 f128 fround = F128::ConvertFromI128(iround);
3918 f128 x = F128::Sub(value, fround);
3919 f128 xx = F128::Mult(x, x);
3921 f128 P = F128::LoadA16(F128::exp2_P_);
3922 f128 Q = F128::LoadA16(F128::exp2_Q_);
3930 px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
3931 px = F128::Mult(x, px);
3937 qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));
3939 x = F128::Div(px, F128::Sub(qx, px));
3943 iround = I128::Add32(iround, I128::SetValue(127, each_int32));
3944 iround = I128::ShiftLeftLogical32(iround, 23);
3945 x = F128::Mult(x, F128::CastFromI128(iround));
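A hedged scalar model of the exponent trick at the end of Exp2: 2^n is built directly from IEEE-754 bits (n + 127 placed into the 8 exponent bits) and multiplied by the approximation of the fractional part. The elided lines around the px/qx rational form are replaced by std::exp2 here.

#include <cmath>
#include <cstdint>
#include <cstring>
float Exp2Model(float value) {
    int n = static_cast<int>(std::lround(value));  // ConvertToI128Round
    float x = value - static_cast<float>(n);       // remainder in [-0.5, 0.5]
    float frac = std::exp2(x);                     // stands in for the px/(qx-px) form
    uint32_t bits = static_cast<uint32_t>(n + 127) << 23;  // exponent bits of 2^n
    float pow2n;
    std::memcpy(&pow2n, &bits, sizeof(pow2n));
    return frac * pow2n;                           // Mult(x, CastFromI128(iround))
}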
3954 static const float log2e = 1.44269504088896340736f;
3955 return Exp2(F128::Mult(log2e, value));
3959 static const float log2e = 1.44269504088896340736f;
3960 f128 neg_one = F128::SetValue(-1.f, each_float);
3961 f128 v0 = F128::MultAdd(log2e, value, neg_one);
3962 f128 v1 = F128::MultSub(log2e, value, neg_one);
3965 return F128::Sub(e0, e1);
3969 static const float log2e = 1.44269504088896340736f;
3970 f128 neg_one = F128::SetValue(-1.f, each_float);
3971 f128 v0 = F128::MultAdd(log2e, value, neg_one);
3972 f128 v1 = F128::MultSub(log2e, value, neg_one);
3975 return F128::Add(e0, e1);
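The two bodies above differ only in the final Sub/Add: with log2e = log2(e), 2^(log2e*v - 1) = e^v/2 and 2^(-log2e*v - 1) = e^(-v)/2, so the difference is sinh(v) and the sum is cosh(v). A scalar check, assuming Exp2 is applied to v0 and v1 in the elided lines:

#include <cmath>
float SinhModel(float v) {
    const float log2e = 1.44269504088896340736f;
    float e0 = std::exp2(log2e * v - 1.f);   // e^v / 2
    float e1 = std::exp2(-log2e * v - 1.f);  // e^-v / 2
    return e0 - e1;                          // == sinh(v); e0 + e1 == cosh(v)
}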
3980 f128 cvalue = F128::LoadA16(tanh_cvalue_);
3984 e = F128::MultAdd(half, e, half);
3986 return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
3990 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
3992 ret.vec.v[0] = tanf(value.vec.v[0]);
3993 ret.vec.v[1] = tanf(value.vec.v[1]);
3994 ret.vec.v[2] = tanf(value.vec.v[2]);
3995 ret.vec.v[3] = tanf(value.vec.v[3]);
3999 f128 C = F128::LoadA16(&F128::tan_c_[0]);
4002 f128 g = F128::Round(F128::Mult<0>(C, value, each_select32));
4005 i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
4006 i128 cmp = I128::CmpEq32(t0, I128::SetZero());
4007 nearx_axis = F128::CastFromI128(cmp);
4014 f128 near_axis = F128::CmpNearEqZero(f, F128::SetValue<3>(C, each_select32));
4016 f128 P = F128::LoadA16(&F128::tan_p_[0]);
4017 f128 Q = F128::LoadA16(&F128::tan_q_[0]);
4019 f128 ff = F128::Mult(f, f);
4023 p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
4024 p = F128::MultAdd(p, ff, one);
4025 p = F128::Mult(f, p);
4028 q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
4029 q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
4030 q = F128::MultAdd(q, ff, one);
4032 p = F128::Select(near_axis, f, p);
4033 q = F128::Select(near_axis, one, q);
4035 f128 r0 = F128::Div(p, q);
4036 f128 r1 = F128::Negate(F128::Recp(r0));
4038 return F128::Select(nearx_axis, r0, r1);
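The final Select implements the usual quadrant fold: in lanes where nearx_axis is false (odd multiples of pi/2) the code returns -1/r0, using the identity tan(x) = -1/tan(x - pi/2), so the rational approximation only ever sees arguments near zero. A scalar restatement of the identity:

#include <cmath>
float TanFoldModel(float x) {
    const float pi_2 = 1.5707963267948966f;
    return -1.f / std::tan(x - pi_2);  // equals tan(x)
}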
4043 #if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
4044 static const float scale = 1.4426950408889634f;
4046 ret.vec.v[0] = logf(value.vec.v[0]);
4047 ret.vec.v[1] = logf(value.vec.v[1]);
4048 ret.vec.v[2] = logf(value.vec.v[2]);
4049 ret.vec.v[3] = logf(value.vec.v[3]);
4050 return F128::Mult(scale, ret);
4053 f128 x = F128::And(F128::SetValue(0x807FFFFFU, each_uint32), value);
4054 x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
4055 i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
4056 e = I128::ShiftRightLogical32(e, 23);
4057 e = I128::Sub32(e, I128::SetValue(127U, each_uint32));
4059 x = F128::Sub(x, F128::SetOne());
4060 f128 z = F128::Mult(x, x);
4063 f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
4064 f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
4065 f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);
4068 p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
4069 p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
4070 p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
4071 p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
4072 p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));
4074 f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));
4075 q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
4076 q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
4077 q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
4078 q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));
4080 y = F128::Mult(z, p);
4081 y = F128::Div(y, q);
4082 y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));
4088 result = F128::Mult(y, log2ea);
4089 result = F128::MultAdd(log2ea, x, result);
4090 result = F128::Add(result, y);
4091 result = F128::Add(result, x);
4092 result = F128::Add(result, F128::ConvertFromI128(e));
4096 f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
4099 f128 is_nan = F128::IsNaN(value);
4101 result = F128::Select(is_nan, nan, result);
4103 f128 is_inf = F128::IsInfinite(value);
4104 f128 is_pos = F128::CmpGtZero(value);
4108 f128 is_pos_inf = F128::And(is_inf, is_pos);
4109 result = F128::Select(is_pos_inf, inf, result);
4113 f128 is_zero = F128::CmpEqZero(value);
4114 result = F128::Select(is_zero, neg_inf, result);
4118 f128 is_neg = F128::CmpLtZero(value);
4119 result = F128::Select(is_neg, neg_nan, result);
4129 #ifdef NLIB_F128_SIMD_NOUSE
4131 ret.vec.v[0] = logf(value.vec.v[0]);
4132 ret.vec.v[1] = logf(value.vec.v[1]);
4133 ret.vec.v[2] = logf(value.vec.v[2]);
4134 ret.vec.v[3] = logf(value.vec.v[3]);
4137 f128 x = F128::Log2(value);
4138 static const float recp_log2e = 0.6931471805597018f;
4139 return F128::Mult(recp_log2e, x);
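The constant 0.6931471805597018f agrees with ln 2 = 1/log2(e) to within float precision, so this is the ordinary change of base. A scalar restatement:

#include <cmath>
float LogModel(float x) { return std::log2(x) * 0.693147180559945f; }  // ln(x)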
4145 #endif // NLIB_DOXYGEN
4156 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4169 SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11, float m12,
4170 float m13, float m20, float m21, float m22, float m23, float m30, float m31,
float m32, float m33) NLIB_NOEXCEPT;
4178 inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03, float m10, float m11,
4179 float m12, float m13, float m20, float m21, float m22, float m23,
float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
4181 r[0] = F128::SetValue(m00, m01, m02, m03);
4182 r[1] = F128::SetValue(m10, m11, m12, m13);
4183 r[2] = F128::SetValue(m20, m21, m22, m23);
4184 r[3] = F128::SetValue(m30, m31, m32, m33);
4188 uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
4189 NLIB_ASSERT((algn & 3) == 0);
4190 switch (algn >> 2) {
4192 r[0] = F128::LoadA16(p);
4193 r[1] = F128::LoadA16(p + 4);
4194 r[2] = F128::LoadA16(p + 8);
4195 r[3] = F128::LoadA16(p + 12);
4198 r[0] = F128::LoadA4(p);
4199 r[1] = F128::LoadA4(p + 4);
4200 r[2] = F128::LoadA4(p + 8);
4201 r[3] = F128::LoadA4(p + 12);
4204 r[0] = F128::LoadA8(p);
4205 r[1] = F128::LoadA8(p + 4);
4206 r[2] = F128::LoadA8(p + 8);
4207 r[3] = F128::LoadA8(p + 12);
4215 #if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
4221 #if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
4222 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4224 f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1); \
4225 f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1); \
4226 f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3); \
4227 f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3); \
4228 row0 = F128::Permute<0, 2, 4, 6>(tmp0, tmp1); \
4229 row1 = F128::Permute<1, 3, 5, 7>(tmp0, tmp1); \
4230 row2 = F128::Permute<0, 2, 4, 6>(tmp2, tmp3); \
4231 row3 = F128::Permute<1, 3, 5, 7>(tmp2, tmp3); \
4233 #elif defined(NLIB_NEON)
4235 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4237 float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4238 float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4239 uint64x2_t row0_, row1_, row2_, row3_; \
4240 row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4241 vreinterpretq_u64_f32(trn_f1_.val[0])); \
4242 row0 = vreinterpretq_f32_u64(row0_); \
4243 row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4244 vreinterpretq_u64_f32(trn_f1_.val[1])); \
4245 row1 = vreinterpretq_f32_u64(row1_); \
4246 row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]), \
4247 vreinterpretq_u64_f32(trn_f1_.val[0])); \
4248 row2 = vreinterpretq_f32_u64(row2_); \
4249 row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]), \
4250 vreinterpretq_u64_f32(trn_f1_.val[1])); \
4251 row3 = vreinterpretq_f32_u64(row3_); \
4254 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4256 float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1); \
4257 float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3); \
4258 row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0])); \
4259 row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1])); \
4260 row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0])); \
4261 row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1])); \
4265 #define NLIB_F128_TRANSPOSE(row0, row1, row2, row3) \
4268 tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]); \
4269 tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]); \
4270 row0.vec.ps[0] = tmp0; \
4271 row1.vec.ps[0] = tmp1; \
4272 tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]); \
4273 tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]); \
4274 row2.vec.ps[1] = tmp0; \
4275 row3.vec.ps[1] = tmp1; \
4276 tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4277 tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4278 row0.vec.ps[1] = row2.vec.ps[0]; \
4279 row1.vec.ps[1] = row3.vec.ps[0]; \
4280 row2.vec.ps[0] = tmp0; \
4281 row3.vec.ps[0] = tmp1; \
4282 tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]); \
4283 tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]); \
4284 row0.vec.ps[1] = tmp0; \
4285 row1.vec.ps[1] = tmp1; \
4310 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4318 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4326 #if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
4337 #endif // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
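A hypothetical usage sketch combining the alignment-dispatching constructor and the transpose macro above; kMat and TransposeDemo are illustrative names, not part of the header.

NLIB_ALIGNAS(16) static const float kMat[16] = {
    1.f, 2.f,  3.f,  4.f,  5.f,  6.f,  7.f,  8.f,
    9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f};
void TransposeDemo() {
    SimdMatrix m(kMat);  // 16-byte aligned pointer -> the LoadA16 path
    NLIB_F128_TRANSPOSE(m.r[0], m.r[1], m.r[2], m.r[3]);  // in-place 4x4 transpose
}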
SimdMatrix()
The default constructor.
f128arg SimdVectorArg
A typedef of f128arg.
A type that holds two 128-bit SIMD registers for single-precision floating-point values.
Implements the classes and functions for integer SIMD operations.
SimdMatrix(f128arg r0, f128arg r1, f128arg r2, f128arg_ex r3) noexcept
Sets up a matrix from the arguments.
An empty struct used as a tag to indicate selecting lanes split into 32-bit units.
f128arg SimdSphereArg
A typedef of f128arg.
A class representing an OBB (oriented bounding box), holding a center coordinate (center), extents along the x, y, and z axes (extent), and a rotation quaternion ...
static f128 ShiftRight(f128arg a, f128arg b) noexcept
Shifts a to the right, filling the vacated lanes with the elements of b shifted in one after another.
An empty struct used as a tag to indicate single-precision floating-point values.
A class collecting the static member functions that handle spheres in 3D space. This class cannot be instantiated. ...
constexpr const each_float_tag each_float
A constant object of type each_float_tag; a tag indicating single-precision floating-point values.
f128arg SimdQuaternionArg
A typedef of f128arg.
nlib_i128_t i128
A typedef of nlib_i128_t.
A class collecting the functions that handle planes in 3D space.
f128arg SimdPlaneArg
A typedef of f128arg.
A class collecting the functions for 3D vector computation. Every function ignores the value set in lane 3 ...
static f128 RotateLeft(f128arg value) noexcept
Rotates the four single-precision floating-point values left by N lanes.
A class collecting the functions that compute distances (and squared distances).
A type for reading 4D vectors from memory and writing them back. Holds floats x, y, z, and w as data members.
const f128 f128arg
A typedef of const f128 or const f128&.
nlib_f128x2_t f128x2
A typedef of nlib_f128x2_t.
f128 SimdSphere
A typedef of f128. Used when handling spheres.
A class for performing single-precision floating-point SIMD operations with the 128-bit registers (XMM0-XMM15 on SSE, Q0-Q15 on NEON) ...
constexpr const each_uint32_tag each_uint32
A constant object of type each_uint32_tag; a tag indicating 32-bit unsigned integers.
#define NLIB_NOEXCEPT
Defined as noexcept, or an equivalent, depending on the environment.
#define NLIB_CEXPR
Defined as constexpr when available; otherwise empty.
A class collecting the functions for 4D vector computation.
A type for reading 3D vectors from memory and writing them back. Holds floats x, y, and z as data members.
#define NLIB_ALIGNAS(x)
Defined as alignas(x) or an equivalent.
constexpr const each_int8_tag each_int8
A constant object of type each_int8_tag; a tag indicating 8-bit signed integers.
constexpr const each_select32_tag each_select32
A constant object of type each_select32_tag; a tag indicating selection of 32-bit lanes. ...
A type for reading 4x3 matrices from memory and writing them back. The data member m is a 4x3 array, 16-byte ...
A type for reading 3x3 matrices from memory and writing them back. The data member m is a 3x3 array. ...
An empty struct used as a tag to indicate 32-bit unsigned integers.
static f128 RotateRight(f128arg value) noexcept
Rotates the four single-precision floating-point values right by N lanes.
nlib_f128_t f128
A typedef of nlib_f128_t.
A class representing an AABB (axis-aligned bounding box) in 3D space, holding the minimum coordinate (point_min) and the maximum coordinate (point_max) ...
constexpr const each_int32_tag each_int32
A constant object of type each_int32_tag; a tag indicating 32-bit signed integers.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion, using static_assert when available.
f128 SimdQuaternion
A typedef of f128. Used when handling quaternions.
A type for reading 4x4 matrices from memory and writing them back. The data member m is a 4x4 array, 16-byte ...
A type for reading 3x4 matrices from memory and writing them back. The data member m is a 3x4 array, 16-byte ...
f128 SimdPlane
A typedef of f128. Used when handling planes.
__m128 nlib_f128_t
A type for a 128-bit SIMD register holding single-precision floating-point values.
f128 SimdVector
A typedef of f128. Used when handling 3D or 4D vectors. ...