#ifndef INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_

#ifdef NN_PLATFORM_CTR
#ifndef __USE_C99_MATH
#define __USE_C99_MATH
#endif
// ...
#define INFINITY ((float)(1e+300 * 1e+300))
#endif

#if !defined(NLIB_SIMD) && !defined(CAFE)
#define NLIB_F128_SIMD_NOUSE
#endif

#ifdef NLIB_F128_SIMD_NOUSE
// ... (portable fallback definition of f128)
#elif defined(NLIB_SSE41)
// ... (SSE4.1 definition of f128)
#elif defined(NLIB_NEON)
// ... (NEON definition of f128)
#endif

#if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// ...
#endif

// Pass f128 by reference where the calling convention cannot keep it in registers:
#if defined(_MSC_VER) || !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
typedef const f128& f128arg_ex;
#else
typedef const f128 f128arg_ex;
#endif

#if !defined(_MSC_VER) && !defined(__vectorcall)
// ...
#endif

// Member declarations of class F128 (the surrounding class definition is elided):
static f128 __vectorcall SetValue(float a, float b, float c, float d) NLIB_NOEXCEPT;
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// ...
#endif

template<bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
// ... (per-lane negate declaration; defined near the end of this excerpt)

template<int V0, int V1, int V2, int V3>
// ... (Swizzle declaration)
template<int V0, int V1, int V2, int V3>
// ... (Permute declaration)
template<bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
// ... (per-lane splat declaration)

// Lane rotations implemented on top of Swizzle/Permute (signatures elided).
// Rotating right by N lanes is rotating left by 4 - N, indices taken modulo 4:
const size_t NN = 4 - N;
return Swizzle<(NN & 3), ((NN + 1) & 3), ((NN + 2) & 3), ((NN + 3) & 3)>(value);
// ...
return Swizzle<(N & 3), ((N + 1) & 3), ((N + 2) & 3), ((N + 3) & 3)>(value);
// ...
return Permute<N, (N + 1), (N + 2), (N + 3)>(a, b);
#define NLIB_M(tp) NLIB_ALWAYS_INLINE tp __vectorcall
#define NLIB_M2(tp) inline tp __vectorcall

// F128::SetValue(float v, each_float): broadcast one float to all four lanes.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_set1_ps(v);
#elif defined(NLIB_NEON)
return vdupq_n_f32(v);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(v);
return ret;
#endif

// F128::SetValue overload taking a 32-bit pattern: broadcast the bits of v.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
// ... (v is reinterpreted as float through a union tmp)
return _mm_set1_ps(tmp.f32);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vdupq_n_u32(v);
return vreinterpretq_f32_u32(tmp);
#else  // CAFE
// ...
ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(tmp.f32);
return ret;
#endif

// F128::SetValue(float a, float b, float c, float d): lanes {a, b, c, d}.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_set_ps(d, c, b, a);
#elif defined(NLIB_NEON)
// ... (a,b and c,d are packed into two 64-bit halves tmp1, tmp2)
return vcombine_f32(vcreate_f32(tmp1.u64), vcreate_f32(tmp2.u64));
#else  // CAFE
f128 ret;
ret.vec.ps[0][0] = a;
ret.vec.ps[0][1] = b;
ret.vec.ps[1][0] = c;
ret.vec.ps[1][1] = d;
return ret;
#endif
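
// Usage sketch (illustrative only, not part of the original header):
//   f128 v = F128::SetValue(1.0f, 2.0f, 3.0f, 4.0f);  // lanes {1, 2, 3, 4}
//   f128 s = F128::SetValue(0.5f, each_float);        // all lanes 0.5
// Note that _mm_set_ps takes its arguments high lane first, which is why the
// SSE4.1 path above passes (d, c, b, a).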
// F128::SetValue<N>(value, each_select32): broadcast lane N to all lanes.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = value.vec.v[N];
ret.vec.v[1] = value.vec.v[N];
ret.vec.v[2] = value.vec.v[N];
ret.vec.v[3] = value.vec.v[N];
return ret;
#elif defined(NLIB_SSE41)
return _mm_shuffle_ps(value, value, _MM_SHUFFLE(N, N, N, N));
#elif defined(NLIB_NEON)
float32x2_t tmp = vget_low_f32(value);
return vdupq_lane_f32(tmp, N);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(value.vec.ps[N / 2][N % 2]);
return ret;
#endif

// On NEON, lanes 2 and 3 must be taken from the high half, hence the
// specializations below:
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// N == 2:
float32x2_t tmp = vget_high_f32(value);
return vdupq_lane_f32(tmp, 0);
// N == 3:
float32x2_t tmp = vget_high_f32(value);
return vdupq_lane_f32(tmp, 1);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// CAFE specializations for N = 0..3 via paired-single merges:
ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[0], value.vec.ps[0]);  // N == 0
// ...
ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[0], value.vec.ps[0]);  // N == 1
// ...
ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE00(value.vec.ps[1], value.vec.ps[1]);  // N == 2
// ...
ret.vec.ps[0] = ret.vec.ps[1] = __PS_MERGE11(value.vec.ps[1], value.vec.ps[1]);  // N == 3
#endif
// F128::SetZero(): all lanes 0.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_setzero_ps();
#elif defined(NLIB_NEON)
return vdupq_n_f32(0);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(0.f);
return ret;
#endif

// F128::Set1000(): lanes {1, 0, 0, 0}.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_NEON)
float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);  // {1.f, 0.f}
float32x2_t x00 = vcreate_f32(0ULL);                   // {0.f, 0.f}
return vcombine_f32(x10, x00);
#else
return F128::LoadA16(F128::v1000_);
#endif

// F128::Set0100(): lanes {0, 1, 0, 0}.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_NEON)
float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);  // {0.f, 1.f}
float32x2_t x00 = vcreate_f32(0ULL);
return vcombine_f32(x01, x00);
#else
return F128::LoadA16(F128::v0100_);
#endif

// F128::Set0010(): lanes {0, 0, 1, 0}.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_NEON)
float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
float32x2_t x00 = vcreate_f32(0ULL);
return vcombine_f32(x00, x10);
#else
return F128::LoadA16(F128::v0010_);
#endif

// F128::Set0001(): lanes {0, 0, 0, 1}.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_NEON)
float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
float32x2_t x00 = vcreate_f32(0ULL);
return vcombine_f32(x00, x01);
#else
return F128::LoadA16(F128::v0001_);
#endif
// Clear lane N, keep the rest (cf. SetFloatToLane below; exact name elided).
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_insert_ps(value, value, 1 << N);  // low 4 bits of the immediate zero lane N
#elif defined(NLIB_NEON)
return F128::Permute<N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
                     N == 3 ? 7 : 3>(value, vdupq_n_f32(0.f));
#else  // CAFE
f128 ret = value;
ret.vec.ps[N / 2][N % 2] = 0.f;
return ret;
#endif
// F128::LoadA16(const float* p): load 4 floats from a 16-byte-aligned address.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_load_ps(p);
#elif defined(NLIB_NEON)
const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
uint64x2_t val = vld1q_u64(tmp);
return vreinterpretq_f32_u64(val);
#else  // CAFE
f128 ret;
ret.vec.ps[0][0] = p[0];
ret.vec.ps[0][1] = p[1];
ret.vec.ps[1][0] = p[2];
ret.vec.ps[1][1] = p[3];
return ret;
#endif

// F128::LoadA4(const float* p): 4-byte-aligned (unaligned SIMD) variant.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
return _mm_loadu_ps(p);
#elif defined(NLIB_NEON)
// ...
#else  // CAFE
f128 ret;
ret.vec.ps[0][0] = p[0];
ret.vec.ps[0][1] = p[1];
ret.vec.ps[1][0] = p[2];
ret.vec.ps[1][1] = p[3];
return ret;
#endif

// F128::LoadA8(const float* p): NEON can use 64-bit loads for 8-byte-aligned data.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
const uint64_t* tmp = reinterpret_cast<const uint64_t*>(p);
uint64x2_t val = vld1q_u64(tmp);
return vreinterpretq_f32_u64(val);
#endif
// ...

// Overloads for other pointer types forward to the float versions:
return LoadA16(reinterpret_cast<const float*>(p));
// ...
return LoadA8(reinterpret_cast<const float*>(p));
// ...
return LoadA4(reinterpret_cast<const float*>(p));
// ...
return LoadA16(reinterpret_cast<const float*>(p));
// ...
return LoadA8(reinterpret_cast<const float*>(p));
// ...
return LoadA4(reinterpret_cast<const float*>(p));
// F128::StoreA16(float* p, f128 value): store 4 floats to a 16-byte-aligned address.
#ifdef NLIB_F128_SIMD_NOUSE
p[0] = value.vec.v[0];
p[1] = value.vec.v[1];
p[2] = value.vec.v[2];
p[3] = value.vec.v[3];
#elif defined(NLIB_SSE41)
_mm_store_ps(p, value);
#elif defined(NLIB_NEON)
uint64x2_t tmp = vreinterpretq_u64_f32(value);
vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else  // CAFE
p[0] = value.vec.ps[0][0];
p[1] = value.vec.ps[0][1];
p[2] = value.vec.ps[1][0];
p[3] = value.vec.ps[1][1];
#endif

// F128::StoreA4(float* p, f128 value): 4-byte-aligned variant.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
_mm_storeu_ps(p, value);
#elif defined(NLIB_NEON)
// ...
#else  // CAFE
p[0] = value.vec.ps[0][0];
p[1] = value.vec.ps[0][1];
p[2] = value.vec.ps[1][0];
p[3] = value.vec.ps[1][1];
#endif

// F128::StoreA8(float* p, f128 value): NEON fast path for 8-byte-aligned data.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
uint64x2_t tmp = vreinterpretq_u64_f32(value);
vst1q_u64(reinterpret_cast<uint64_t*>(p), tmp);
#endif
// ...

// Overloads for other pointer types:
StoreA16(reinterpret_cast<float*>(p), value);
// ...
StoreA8(reinterpret_cast<float*>(p), value);
// ...
StoreA4(reinterpret_cast<float*>(p), value);
// ...
StoreA16(reinterpret_cast<float*>(p), value);
// ...
StoreA8(reinterpret_cast<float*>(p), value);
// ...
StoreA4(reinterpret_cast<float*>(p), value);
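
// Round-trip sketch (illustrative only; alignas(16) stands in for whatever
// alignment macro the surrounding project uses):
//   alignas(16) float buf[4] = {1.f, 2.f, 3.f, 4.f};
//   f128 v = F128::LoadA16(buf);   // aligned load
//   F128::StoreA16(buf, v);        // aligned store
// The A16/A8/A4 suffix is the alignment, in bytes, that p must guarantee.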
// F128::StoreLoA8(float* p, f128 value): store lanes 0-1 (low 8 bytes).
#ifdef NLIB_F128_SIMD_NOUSE
p[0] = value.vec.v[0];
p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
_mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
uint64x1_t tmp = vget_low_u64(vreinterpretq_u64_f32(value));
vst1_u64(reinterpret_cast<uint64_t*>(p), tmp);
#else  // CAFE
p[0] = value.vec.ps[0][0];
p[1] = value.vec.ps[0][1];
#endif

// F128::StoreLoA4(float* p, f128 value): 4-byte-aligned variant.
#ifdef NLIB_F128_SIMD_NOUSE
p[0] = value.vec.v[0];
p[1] = value.vec.v[1];
#elif defined(NLIB_SSE41)
_mm_storel_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
float32x2_t tmp = vget_low_f32(value);
vst1_f32(p, tmp);
#else  // CAFE
p[0] = value.vec.ps[0][0];
p[1] = value.vec.ps[0][1];
#endif

// Overloads for other pointer types:
StoreLoA8(reinterpret_cast<float*>(p), value);
// ...
StoreLoA4(reinterpret_cast<float*>(p), value);
// ...
StoreLoA8(reinterpret_cast<float*>(p), value);
// ...
StoreLoA4(reinterpret_cast<float*>(p), value);
// F128::StoreHiA8(float* p, f128 value): store lanes 2-3 (high 8 bytes).
#ifdef NLIB_F128_SIMD_NOUSE
p[0] = value.vec.v[2];
p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
_mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
vst1_f32(p, vget_high_f32(value));
#else  // CAFE
p[0] = value.vec.ps[1][0];
p[1] = value.vec.ps[1][1];
#endif

// F128::StoreHiA4(float* p, f128 value): 4-byte-aligned variant.
#ifdef NLIB_F128_SIMD_NOUSE
p[0] = value.vec.v[2];
p[1] = value.vec.v[3];
#elif defined(NLIB_SSE41)
_mm_storeh_pi(reinterpret_cast<__m64*>(p), value);
#elif defined(NLIB_NEON)
float32x2_t tmp = vget_high_f32(value);
vst1_f32(p, tmp);
#else  // CAFE
p[0] = value.vec.ps[1][0];
p[1] = value.vec.ps[1][1];
#endif

// Overloads for other pointer types:
StoreHiA8(reinterpret_cast<float*>(p), value);
// ...
StoreHiA4(reinterpret_cast<float*>(p), value);
// ...
StoreHiA8(reinterpret_cast<float*>(p), value);
// ...
StoreHiA4(reinterpret_cast<float*>(p), value);
// F128::Abs(value): per-lane absolute value.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = value.vec.v[0] > 0 ? value.vec.v[0] : -value.vec.v[0];
ret.vec.v[1] = value.vec.v[1] > 0 ? value.vec.v[1] : -value.vec.v[1];
ret.vec.v[2] = value.vec.v[2] > 0 ? value.vec.v[2] : -value.vec.v[2];
ret.vec.v[3] = value.vec.v[3] > 0 ? value.vec.v[3] : -value.vec.v[3];
return ret;
#elif defined(NLIB_NEON)
return vabsq_f32(value);
#elif defined(NLIB_SSE41)
const __m128 signmask = _mm_set1_ps(-0.0f);  // only the sign bit set
return _mm_andnot_ps(signmask, value);       // clear the sign bit
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_ABS(value.vec.ps[0]);
ret.vec.ps[1] = __PS_ABS(value.vec.ps[1]);
return ret;
#endif
// F128::Select(mask, a, b): per-bit select; a where mask bits are 1, b elsewhere.
#ifdef NLIB_F128_SIMD_NOUSE
f128 result;
result.vec.u[0] = (a.vec.u[0] & mask.vec.u[0]) | (b.vec.u[0] & ~mask.vec.u[0]);
result.vec.u[1] = (a.vec.u[1] & mask.vec.u[1]) | (b.vec.u[1] & ~mask.vec.u[1]);
result.vec.u[2] = (a.vec.u[2] & mask.vec.u[2]) | (b.vec.u[2] & ~mask.vec.u[2]);
result.vec.u[3] = (a.vec.u[3] & mask.vec.u[3]) | (b.vec.u[3] & ~mask.vec.u[3]);
return result;
#elif defined(NLIB_SSE41)
return _mm_blendv_ps(b, a, mask);  // blendv picks the second operand where the sign bit is set
#elif defined(NLIB_NEON)
return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
#else  // CAFE
// __PS_SEL selects by sign, so the all-ones mask (a NaN bit pattern) is first
// turned into an ordinary negative value:
f128 mask_ = mask;
mask_.vec.u[0] &= 0xFF7FFFFFUL;
mask_.vec.u[1] &= 0xFF7FFFFFUL;
mask_.vec.u[2] &= 0xFF7FFFFFUL;
mask_.vec.u[3] &= 0xFF7FFFFFUL;
f128 ret;
ret.vec.ps[0] = __PS_SEL(mask_.vec.ps[0], b.vec.ps[0], a.vec.ps[0]);
ret.vec.ps[1] = __PS_SEL(mask_.vec.ps[1], b.vec.ps[1], a.vec.ps[1]);
return ret;
#endif
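
// Branchless select sketch (illustrative only): clamp negative lanes to zero.
//   f128 mask = F128::CmpLt(v, F128::SetZero());      // all-ones where v < 0
//   f128 r = F128::Select(mask, F128::SetZero(), v);  // v < 0 ? 0 : v, per lane
// Comparisons produce full-width lane masks (0xFFFFFFFF or 0), which is
// exactly what Select consumes.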
#if !defined(NLIB_F128_SIMD_NOUSE) && !defined(CAFE)
// F128::ConvertFromI128(value): int32 lanes -> float lanes.
#if defined(NLIB_SSE41)
return _mm_cvtepi32_ps(value);
#elif defined(NLIB_NEON)
return vcvtq_f32_s32(vreinterpretq_s32_s8(value));
#endif

// F128::CastFromI128(value): reinterpret the bits, no conversion.
#if defined(NLIB_SSE41)
return _mm_castsi128_ps(value);
#elif defined(NLIB_NEON)
return vreinterpretq_f32_s8(value);
#endif

// float -> int32 with round-to-nearest:
#if defined(NLIB_SSE41)
return _mm_cvtps_epi32(value);
#elif defined(NLIB_NEON)
// NEON conversion truncates, so bias by +/-0.5 (matching the sign) first:
uint32x4_t half = vreinterpretq_u32_f32(vdupq_n_f32(0.5f));
uint32x4_t sgn = vdupq_n_u32(0x80000000U);
uint32x4_t w = vandq_u32(vreinterpretq_u32_f32(value), sgn);
w = vorrq_u32(w, half);
return vreinterpretq_s8_s32(vcvtq_s32_f32(vaddq_f32(value, vreinterpretq_f32_u32(w))));
#endif

// F128::ConvertToI128Truncate(value): float -> int32, truncating toward zero.
#if defined(NLIB_SSE41)
return _mm_cvttps_epi32(value);
#elif defined(NLIB_NEON)
return vreinterpretq_s8_s32(vcvtq_s32_f32(value));
#endif

// F128::CastToI128(value): reinterpret the bits, no conversion.
#if defined(NLIB_SSE41)
return _mm_castps_si128(value);
#elif defined(NLIB_NEON)
return vreinterpretq_s8_f32(value);
#endif
#endif

// Fixed-point conversions with N fractional bits:
#if defined(NLIB_NEON)
return vcvtq_n_f32_s32(vreinterpretq_s32_s8(value), N);
#else
f128 f = F128::ConvertFromI128(value);
// ... (m is 1.0f / 2^N)
return F128::Mult(f, m);
#endif

#if defined(NLIB_NEON)
return vreinterpretq_s8_s32(vcvtq_n_s32_f32(value, N));
#else
// ... (m is 2^N)
f128 f = F128::Mult(value, m);
return F128::ConvertToI128Truncate(f);
#endif
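
// Fixed-point sketch (illustrative only; ConvertFromFixedPoint is a stand-in
// name, since the public wrapper is outside this excerpt): with N = 8 the
// integer 256 becomes 1.0f, because vcvtq_n_f32_s32 divides by 2^N and the
// fallback multiplies by 1.0f / (1 << N).
//   f128 f = F128::ConvertFromFixedPoint<8>(raw_q24_8);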
// F128::CmpLt(a, b): per-lane a < b; true lanes become all-ones.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] < b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] < b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] < b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] < b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmplt_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vcltq_f32(a, b);
return vreinterpretq_f32_u32(tmp);
#endif

// F128::CmpLe(a, b): per-lane a <= b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] <= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] <= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] <= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] <= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmple_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vcleq_f32(a, b);
return vreinterpretq_f32_u32(tmp);
#endif

// F128::CmpGt(a, b): per-lane a > b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] > b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] > b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] > b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] > b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmpgt_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vcgtq_f32(a, b);
return vreinterpretq_f32_u32(tmp);
#endif

// F128::CmpGe(a, b): per-lane a >= b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] >= b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] >= b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] >= b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] >= b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmpge_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vcgeq_f32(a, b);
return vreinterpretq_f32_u32(tmp);
#endif

// F128::CmpNe(a, b): per-lane a != b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] != b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] != b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] != b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] != b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmpneq_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vmvnq_u32(vceqq_f32(a, b));
return vreinterpretq_f32_u32(tmp);
#endif
// F128::Add(a, b): per-lane addition.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] + b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] + b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] + b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] + b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_add_ps(a, b);
#elif defined(NLIB_NEON)
return vaddq_f32(a, b);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_ADD(a.vec.ps[0], b.vec.ps[0]);
ret.vec.ps[1] = __PS_ADD(a.vec.ps[1], b.vec.ps[1]);
return ret;
#endif

// F128::Sub(a, b): per-lane subtraction.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] - b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] - b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] - b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] - b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_sub_ps(a, b);
#elif defined(NLIB_NEON)
return vsubq_f32(a, b);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
ret.vec.ps[1] = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
return ret;
#endif

// F128::Negate(value): per-lane negation.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = -value.vec.v[0];
ret.vec.v[1] = -value.vec.v[1];
ret.vec.v[2] = -value.vec.v[2];
ret.vec.v[3] = -value.vec.v[3];
return ret;
#elif defined(NLIB_NEON)
return vnegq_f32(value);
#elif defined(NLIB_SSE41)
const __m128 signmask = _mm_set1_ps(-0.0f);
return _mm_xor_ps(signmask, value);  // flip the sign bit
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_NEG(value.vec.ps[0]);
ret.vec.ps[1] = __PS_NEG(value.vec.ps[1]);
return ret;
#endif
// F128::Mult(a, b): per-lane multiplication.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] * b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] * b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] * b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] * b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_mul_ps(a, b);
#elif defined(NLIB_NEON)
return vmulq_f32(a, b);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_MUL(a.vec.ps[0], b.vec.ps[0]);
ret.vec.ps[1] = __PS_MUL(a.vec.ps[1], b.vec.ps[1]);
return ret;
#endif

// F128::Mult(float a, f128 b): scale every lane of b by a scalar.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
return vmulq_n_f32(b, a);
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], a);
ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], a);
return ret;
#else
return F128::Mult(b, F128::SetValue(a, each_float));
#endif

// F128::Mult<N>(a, b): multiply b by lane N of a.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vmulq_laneq_f32(b, a, N);
#else
float tmp = vget_lane_f32((N < 2 ? vget_low_f32(a) : vget_high_f32(a)), (N & 1));
return vmulq_n_f32(b, tmp);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
float t = a.vec.ps[N / 2][N % 2];
f128 ret;
ret.vec.ps[0] = __PS_MULS0F(b.vec.ps[0], t);
ret.vec.ps[1] = __PS_MULS0F(b.vec.ps[1], t);
return ret;
#endif
// F128::Div(a, b): per-lane division.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] / b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] / b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] / b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] / b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_div_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
return vdivq_f32(a, b);
#else
// 32-bit NEON has no divide: refine a reciprocal estimate with two
// Newton-Raphson steps, then multiply.
float32x4_t inv0 = vrecpeq_f32(b);
float32x4_t step0 = vrecpsq_f32(inv0, b);
float32x4_t inv1 = vmulq_f32(step0, inv0);
float32x4_t step1 = vrecpsq_f32(inv1, b);
float32x4_t inv2 = vmulq_f32(step1, inv1);
uint32x4_t zeromask = vceqq_f32(b, vdupq_n_f32(0));
inv2 = vbslq_f32(zeromask, F128::SetInfinity(), inv2);  // x / 0 -> +inf
return vmulq_f32(a, inv2);
#endif
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_DIV(a.vec.ps[0], b.vec.ps[0]);
ret.vec.ps[1] = __PS_DIV(a.vec.ps[1], b.vec.ps[1]);
return ret;
#endif
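
// The refinement above is the Newton-Raphson iteration for f(x) = 1/x - b:
// x' = x * (2 - b * x), where vrecpsq_f32(x, b) computes (2 - b * x) in a
// single instruction. Each step roughly doubles the number of correct bits
// of the initial vrecpeq_f32 estimate.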
// F128::Max(a, b): per-lane maximum.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] > b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] > b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] > b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] > b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_max_ps(a, b);
#elif defined(NLIB_NEON)
return vmaxq_f32(a, b);
#else  // CAFE: select by the sign of a - b
f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
f128 ret;
ret.vec.ps[0] = __PS_SEL(cmp0, a.vec.ps[0], b.vec.ps[0]);
ret.vec.ps[1] = __PS_SEL(cmp1, a.vec.ps[1], b.vec.ps[1]);
return ret;
#endif

// F128::Min(a, b): per-lane minimum.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] < b.vec.v[0] ? a.vec.v[0] : b.vec.v[0];
ret.vec.v[1] = a.vec.v[1] < b.vec.v[1] ? a.vec.v[1] : b.vec.v[1];
ret.vec.v[2] = a.vec.v[2] < b.vec.v[2] ? a.vec.v[2] : b.vec.v[2];
ret.vec.v[3] = a.vec.v[3] < b.vec.v[3] ? a.vec.v[3] : b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_min_ps(a, b);
#elif defined(NLIB_NEON)
return vminq_f32(a, b);
#else  // CAFE
f32x2 cmp0 = __PS_SUB(a.vec.ps[0], b.vec.ps[0]);
f32x2 cmp1 = __PS_SUB(a.vec.ps[1], b.vec.ps[1]);
f128 ret;
ret.vec.ps[0] = __PS_SEL(cmp0, b.vec.ps[0], a.vec.ps[0]);
ret.vec.ps[1] = __PS_SEL(cmp1, b.vec.ps[1], a.vec.ps[1]);
return ret;
#endif
// Pairwise maximum: {max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3)}.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] > a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
ret.vec.v[1] = a.vec.v[2] > a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
ret.vec.v[2] = b.vec.v[0] > b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
ret.vec.v[3] = b.vec.v[2] > b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
f128 ax = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
f128 bx = _mm_max_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
return vpmaxq_f32(a, b);
#else
float32x2_t rl = vpmax_f32(vget_low_f32(a), vget_high_f32(a));
float32x2_t rh = vpmax_f32(vget_low_f32(b), vget_high_f32(b));
return vcombine_f32(rl, rh);
#endif
#else  // CAFE
f32x2 v02, v13, cmp;
f128 ret;
v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
cmp = __PS_SUB(v02, v13);
ret.vec.ps[0] = __PS_SEL(cmp, v02, v13);
v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
cmp = __PS_SUB(v02, v13);
ret.vec.ps[1] = __PS_SEL(cmp, v02, v13);
return ret;
#endif

// Pairwise minimum: {min(a0,a1), min(a2,a3), min(b0,b1), min(b2,b3)}.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] < a.vec.v[1] ? a.vec.v[0] : a.vec.v[1];
ret.vec.v[1] = a.vec.v[2] < a.vec.v[3] ? a.vec.v[2] : a.vec.v[3];
ret.vec.v[2] = b.vec.v[0] < b.vec.v[1] ? b.vec.v[0] : b.vec.v[1];
ret.vec.v[3] = b.vec.v[2] < b.vec.v[3] ? b.vec.v[2] : b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
f128 ax = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)));
f128 bx = _mm_min_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)));
return _mm_shuffle_ps(ax, bx, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(NLIB_NEON)
#ifdef __aarch64__
return vpminq_f32(a, b);
#else
float32x2_t rl = vpmin_f32(vget_low_f32(a), vget_high_f32(a));
float32x2_t rh = vpmin_f32(vget_low_f32(b), vget_high_f32(b));
return vcombine_f32(rl, rh);
#endif
#else  // CAFE
f32x2 v02, v13, cmp;
f128 ret;
v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
cmp = __PS_SUB(v02, v13);
ret.vec.ps[0] = __PS_SEL(cmp, v13, v02);
v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
cmp = __PS_SUB(v02, v13);
ret.vec.ps[1] = __PS_SEL(cmp, v13, v02);
return ret;
#endif
// Pairwise (horizontal) add: {a0+a1, a2+a3, b0+b1, b2+b3}.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = a.vec.v[0] + a.vec.v[1];
ret.vec.v[1] = a.vec.v[2] + a.vec.v[3];
ret.vec.v[2] = b.vec.v[0] + b.vec.v[1];
ret.vec.v[3] = b.vec.v[2] + b.vec.v[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_hadd_ps(a, b);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
return vpaddq_f32(a, b);
#else
float32x2_t al = vget_low_f32(a);
float32x2_t ah = vget_high_f32(a);
float32x2_t l = vpadd_f32(al, ah);
float32x2_t bl = vget_low_f32(b);
float32x2_t bh = vget_high_f32(b);
float32x2_t h = vpadd_f32(bl, bh);
return vcombine_f32(l, h);
#endif
#else  // CAFE
f32x2 v02, v13;
f128 ret;
v02 = __PS_MERGE00(a.vec.ps[0], a.vec.ps[1]);
v13 = __PS_MERGE11(a.vec.ps[0], a.vec.ps[1]);
ret.vec.ps[0] = __PS_ADD(v02, v13);
v02 = __PS_MERGE00(b.vec.ps[0], b.vec.ps[1]);
v13 = __PS_MERGE11(b.vec.ps[0], b.vec.ps[1]);
ret.vec.ps[1] = __PS_ADD(v02, v13);
return ret;
#endif
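
// Reduction sketch (illustrative only; PairwiseAdd stands in for the
// horizontal add defined above): summing all four lanes of v.
//   f128 s = F128::PairwiseAdd(v, v);  // {v0+v1, v2+v3, v0+v1, v2+v3}
//   s = F128::PairwiseAdd(s, s);       // every lane now holds v0+v1+v2+v3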
// F128::AbsDiff(a, b): |a - b| per lane.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
return vabdq_f32(a, b);
#else
return F128::Abs(F128::Sub(a, b));
#endif

// F128::MultAdd(a, b, c): c + a * b (fused where the ISA provides it).
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmaq_f32(c, a, b);
#else
return vmlaq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.ps[0] = __PS_MADD(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
ret.vec.ps[1] = __PS_MADD(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
return ret;
#else
return F128::Add(c, F128::Mult(a, b));
#endif

// Scalar-coefficient variant: c + a * b.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmaq_n_f32(c, b, a);
#else
return vmlaq_n_f32(c, b, a);
#endif
#else
return F128::MultAdd(F128::SetValue(a, each_float), b, c);
#endif

// Lane variant: c + lane N of a times b.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmaq_laneq_f32(c, b, a, N);
#else
return vmlaq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
return F128::MultAdd(F128::SetValue<N>(a, each_select32), b, c);
#endif

// F128::MultSub(a, b, c): c - a * b.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmsq_f32(c, a, b);
#else
return vmlsq_f32(c, a, b);
#endif
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.ps[0] = __PS_NMSUB(a.vec.ps[0], b.vec.ps[0], c.vec.ps[0]);
ret.vec.ps[1] = __PS_NMSUB(a.vec.ps[1], b.vec.ps[1], c.vec.ps[1]);
return ret;
#else
return F128::Sub(c, F128::Mult(a, b));
#endif

// Scalar-coefficient variant: c - a * b.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmsq_n_f32(c, b, a);
#else
return vmlsq_n_f32(c, b, a);
#endif
#else
return F128::MultSub(F128::SetValue(a, each_float), b, c);
#endif

// Lane variant: c - lane N of a times b.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
#ifdef __aarch64__
return vfmsq_laneq_f32(c, b, a, N);
#else
return vmlsq_lane_f32(c, b, N < 2 ? vget_low_f32(a) : vget_high_f32(a), (N & 1));
#endif
#else
return F128::MultSub(F128::SetValue<N>(a, each_select32), b, c);
#endif
// Linear interpolation: a + t * (b - a).
return F128::MultAdd(t, F128::Sub(b, a), a);
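
// Sketch (illustrative only; Lerp is an assumed name for the body above):
//   f128 mid = F128::Lerp(a, b, F128::SetValue(0.5f, each_float));
// t = 0 returns a, t = 1 returns b; intermediate t blends per lane.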
// F128::And(a, b): bitwise AND (commonly applied to comparison masks).
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = a.vec.u[0] & b.vec.u[0];
ret.vec.u[1] = a.vec.u[1] & b.vec.u[1];
ret.vec.u[2] = a.vec.u[2] & b.vec.u[2];
ret.vec.u[3] = a.vec.u[3] & b.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_and_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
return vreinterpretq_f32_u32(tmp);
#endif
// Add two angles and wrap the sum back into [-pi, pi):
f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
// ... (pi_dbl = 2*pi; cond = lanes where the sum falls below -pi)
f128 sum = F128::Add(angle1, angle2);
// ...
f128 ofs = F128::And(cond, pi_dbl);
f128 result = F128::Add(sum, ofs);  // wrap up by 2*pi where needed
cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
ofs = F128::And(cond, pi_dbl);
return F128::Sub(result, ofs);      // wrap down by 2*pi where needed

// Subtract two angles with the same wrapping:
f128 pi_pi2 = F128::LoadA16(F128::pi_values_);
// ...
f128 sum = F128::Sub(angle1, angle2);
// ...
f128 ofs = F128::And(cond, pi_dbl);
f128 result = F128::Add(sum, ofs);
cond = F128::CmpGe(sum, F128::SetValue<0>(pi_pi2, each_select32));
ofs = F128::And(cond, pi_dbl);
return F128::Sub(result, ofs);
// Hermite interpolation: evaluate the cubic basis at t.
f128 tt = F128::Mult(t, t);
f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
f128 hermite_R0 =
    vcombine_f32(vcreate_f32(0x3F80000040000000ULL), vcreate_f32(0x3F800000C0000000ULL));
f128 hermite_R1 =
    vcombine_f32(vcreate_f32(0xC0000000C0400000ULL), vcreate_f32(0xBF80000040400000ULL));
#else
f128 hermite_R0 = F128::LoadA16(hermite_R0_);
f128 hermite_R1 = F128::LoadA16(hermite_R1_);
#endif
// basis = R0 * t^3 + R1 * t^2 + {0,1,0,0} * t + {1,0,0,0}:
ttt = F128::Mult(ttt, hermite_R0);
ttt = F128::MultAdd(tt, hermite_R1, ttt);
ttt = F128::MultAdd(t, F128::Set0100(), ttt);
ttt = F128::Add(ttt, F128::Set1000());
// ... (the basis is then applied to the control values)
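
// Decoding the NEON constants above: R0 = {2, 1, -2, 1} and R1 = {-3, -2, 3, -1},
// so the four lanes of the result are the Hermite basis polynomials
//   h00 = 2t^3 - 3t^2 + 1,   h10 = t^3 - 2t^2 + t,
//   h01 = -2t^3 + 3t^2,      h11 = t^3 - t^2,
// which weight (p0, tangent0, p1, tangent1) respectively.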
// Catmull-Rom interpolation: evaluate the cubic basis at t.
f128 tt = F128::Mult(t, t);
f128 ttt = F128::Mult(tt, t);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
f128 catmull_R0 =
    vcombine_f32(vcreate_f32(0x40400000BF800000ULL), vcreate_f32(0x3F800000C0400000ULL));
f128 catmull_R1 =
    vcombine_f32(vcreate_f32(0xC0A0000040000000ULL), vcreate_f32(0xBF80000040800000ULL));
f128 catmull_R2 =
    vcombine_f32(vcreate_f32(0x00000000BF800000ULL), vcreate_f32(0x000000003F800000ULL));
#else
f128 catmull_R0 = F128::LoadA16(catmull_R0_);
f128 catmull_R1 = F128::LoadA16(catmull_R1_);
f128 catmull_R2 = F128::LoadA16(catmull_R2_);
#endif
// The constants decode to R0 = {-1, 3, -3, 1}, R1 = {2, -5, 4, -1} and
// R2 = {-1, 0, 1, 0}, the Catmull-Rom coefficient columns; the final scaling
// and combination continue outside this excerpt.
ttt = F128::Mult(ttt, catmull_R0);
ttt = F128::MultAdd(tt, catmull_R1, ttt);
ttt = F128::MultAdd(t, catmull_R2, ttt);
ttt = F128::Add(ttt, F128::Set0100());
// ...
// Barycentric interpolation: p0 + f * (p1 - p0) + g * (p2 - p0).
f128 p1p0 = F128::Sub(p1, p0);
f128 p2p0 = F128::Sub(p2, p0);
f128 tmp = F128::MultAdd(f, p1p0, p0);
return F128::MultAdd(g, p2p0, tmp);
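
// With (f, g) = (0, 0), (1, 0) and (0, 1) this reproduces p0, p1 and p2
// exactly; points with f, g >= 0 and f + g <= 1 lie inside the triangle.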
// F128::Or(a, b): bitwise OR.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = a.vec.u[0] | b.vec.u[0];
ret.vec.u[1] = a.vec.u[1] | b.vec.u[1];
ret.vec.u[2] = a.vec.u[2] | b.vec.u[2];
ret.vec.u[3] = a.vec.u[3] | b.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_or_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
return vreinterpretq_f32_u32(tmp);
#endif

// F128::Xor(a, b): bitwise XOR.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = a.vec.u[0] ^ b.vec.u[0];
ret.vec.u[1] = a.vec.u[1] ^ b.vec.u[1];
ret.vec.u[2] = a.vec.u[2] ^ b.vec.u[2];
ret.vec.u[3] = a.vec.u[3] ^ b.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_xor_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b));
return vreinterpretq_f32_u32(tmp);
#endif

// F128::Not(a): bitwise complement.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = ~a.vec.u[0];
ret.vec.u[1] = ~a.vec.u[1];
ret.vec.u[2] = ~a.vec.u[2];
ret.vec.u[3] = ~a.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_andnot_ps(a, F128::CmpEq(a, a));  // CmpEq(a, a) yields all-ones
#elif defined(NLIB_NEON)
uint32x4_t tmp = vmvnq_u32(vreinterpretq_u32_f32(a));
return vreinterpretq_f32_u32(tmp);
#endif

// F128::AndNot(a, b): ~a & b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = ~a.vec.u[0] & b.vec.u[0];
ret.vec.u[1] = ~a.vec.u[1] & b.vec.u[1];
ret.vec.u[2] = ~a.vec.u[2] & b.vec.u[2];
ret.vec.u[3] = ~a.vec.u[3] & b.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_andnot_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vbicq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
return vreinterpretq_f32_u32(tmp);
#endif

// F128::OrNot(a, b): ~a | b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = ~a.vec.u[0] | b.vec.u[0];
ret.vec.u[1] = ~a.vec.u[1] | b.vec.u[1];
ret.vec.u[2] = ~a.vec.u[2] | b.vec.u[2];
ret.vec.u[3] = ~a.vec.u[3] | b.vec.u[3];
return ret;
#elif defined(NLIB_SSE41)
return _mm_or_ps(F128::Not(a), b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vornq_u32(vreinterpretq_u32_f32(b), vreinterpretq_u32_f32(a));
return vreinterpretq_f32_u32(tmp);
#endif
// F128::CmpEq(a, b): per-lane a == b.
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.u[0] = (a.vec.v[0] == b.vec.v[0]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[1] = (a.vec.v[1] == b.vec.v[1]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[2] = (a.vec.v[2] == b.vec.v[2]) ? 0xFFFFFFFFUL : 0;
ret.vec.u[3] = (a.vec.v[3] == b.vec.v[3]) ? 0xFFFFFFFFUL : 0;
return ret;
#elif defined(NLIB_SSE41)
return _mm_cmpeq_ps(a, b);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vceqq_f32(a, b);
return vreinterpretq_f32_u32(tmp);
#endif

// Approximate equality: |a - b| <= eps, per lane.
f128 tmp = F128::AbsDiff(a, b);
return F128::CmpLe(tmp, eps);
// F128::Clamp(value, min, max):
return F128::Min(max, F128::Max(min, value));

// Per-lane bounds test: |value| <= bounds.
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
uint32x4_t tmp = vcaleq_f32(value, bounds);  // compares absolute values directly
return vreinterpretq_f32_u32(tmp);
#else
return F128::CmpLe(F128::Abs(value), bounds);
#endif

// Comparisons against zero; AArch64 has dedicated instructions for these.
// F128::CmpEqZero(value):
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vceqzq_f32(value));
#else
return F128::CmpEq(value, F128::SetZero());
#endif

// F128::CmpLtZero(value):
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vcltzq_f32(value));
#else
return F128::CmpLt(value, F128::SetZero());
#endif

// value <= 0:
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vclezq_f32(value));
#else
return F128::CmpLe(value, F128::SetZero());
#endif

// F128::CmpGtZero(value):
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vcgtzq_f32(value));
#else
return F128::CmpGt(value, F128::SetZero());
#endif

// value >= 0:
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vcgezq_f32(value));
#else
return F128::CmpGe(value, F128::SetZero());
#endif

// value != 0:
#if defined(__aarch64__) && !defined(NLIB_F128_SIMD_NOUSE)
return vreinterpretq_f32_u32(vmvnq_u32(vceqzq_f32(value)));
#else
return F128::CmpNe(value, F128::SetZero());
#endif

// Near-zero test: |value| <= eps.
f128 tmp = F128::Abs(value);
return F128::CmpLe(tmp, eps);
// F128::Recp(value): per-lane reciprocal; 1/0 yields +inf.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
return ret;
#elif defined(NLIB_SSE41)
return _mm_div_ps(F128::SetOne(), value);
#elif defined(NLIB_NEON)
#ifdef __aarch64__
return vdivq_f32(vdupq_n_f32(1.f), value);
#else
float32x4_t x;
x = vrecpeq_f32(value);
x = vmulq_f32(x, vrecpsq_f32(x, value));
x = vmulq_f32(x, vrecpsq_f32(x, value));
uint32x4_t zeromask = vceqq_f32(value, vdupq_n_f32(0));
float32x4_t result = vbslq_f32(zeromask, F128::SetInfinity(), x);
return result;
#endif
#else  // CAFE
return F128::Div(F128::SetOne(), value);
#endif

// Fast reciprocal estimate (no refinement):
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / value.vec.v[0] : INFINITY;
ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / value.vec.v[1] : INFINITY;
ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / value.vec.v[2] : INFINITY;
ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / value.vec.v[3] : INFINITY;
return ret;
#elif defined(NLIB_SSE41)
return _mm_rcp_ps(value);
#elif defined(NLIB_NEON)
return vrecpeq_f32(value);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_RES(value.vec.ps[0]);
ret.vec.ps[1] = __PS_RES(value.vec.ps[1]);
return ret;
#endif
// F128::Sqrt(value): per-lane square root.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = sqrtf(value.vec.v[0]);
ret.vec.v[1] = sqrtf(value.vec.v[1]);
ret.vec.v[2] = sqrtf(value.vec.v[2]);
ret.vec.v[3] = sqrtf(value.vec.v[3]);
return ret;
#elif defined(NLIB_SSE41)
return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
// sqrt(x) = x * rsqrt(x); zero lanes are masked back to zero because
// RecpSqrt(0) is +inf:
f128 iszero = F128::CmpEqZero(value);
f128 result = F128::Mult(value, F128::RecpSqrt(value));
return F128::AndNot(iszero, result);
#else  // CAFE
f128 zero = F128::SetZero();
f128 iszero = F128::CmpEq(zero, value);
f128 result = F128::Mult(value, F128::RecpSqrt(value));
return F128::Select(iszero, zero, result);
#endif

// Fast square-root estimate:
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = sqrtf(value.vec.v[0]);
ret.vec.v[1] = sqrtf(value.vec.v[1]);
ret.vec.v[2] = sqrtf(value.vec.v[2]);
ret.vec.v[3] = sqrtf(value.vec.v[3]);
return ret;
#elif defined(NLIB_SSE41)
return _mm_sqrt_ps(value);
#elif defined(NLIB_NEON)
return vrecpeq_f32(vrsqrteq_f32(value));  // 1 / (1 / sqrt(x))
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_RES(__PS_RSQRTE(value.vec.ps[0]));
ret.vec.ps[1] = __PS_RES(__PS_RSQRTE(value.vec.ps[1]));
return ret;
#endif
// F128::RecpSqrt(value): 1 / sqrt(x) with Newton-Raphson refinement; 0 -> +inf.
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
return ret;
#elif defined(NLIB_SSE41)
return _mm_div_ps(F128::SetOne(), F128::Sqrt(value));
#elif defined(NLIB_NEON)
float32x4_t x;
x = vrsqrteq_f32(value);
x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
x = vmulq_f32(x, vrsqrtsq_f32(value, vmulq_f32(x, x)));
f128 zeromask = F128::CmpEqZero(value);
return F128::Select(zeromask, F128::SetInfinity(), x);
#else  // CAFE: two Newton-Raphson steps per paired-single half
f32x2 three = __PS_FDUP(3.f);
f32x2 half = __PS_FDUP(0.5f);
f128 ret;
f32x2 v, x, xx;
v = value.vec.ps[0];
x = __PS_RSQRTE(v);
xx = __PS_MUL(x, x);
xx = __PS_NMSUB(v, xx, three);   // 3 - v * x^2
xx = __PS_MUL(x, xx);
x = __PS_MUL(half, xx);          // x' = x * (3 - v * x^2) / 2
xx = __PS_MUL(x, x);
xx = __PS_NMSUB(v, xx, three);
xx = __PS_MUL(x, xx);
ret.vec.ps[0] = __PS_MUL(half, xx);
v = value.vec.ps[1];
x = __PS_RSQRTE(v);
xx = __PS_MUL(x, x);
xx = __PS_NMSUB(v, xx, three);
xx = __PS_MUL(x, xx);
x = __PS_MUL(half, xx);
xx = __PS_MUL(x, x);
xx = __PS_NMSUB(v, xx, three);
xx = __PS_MUL(x, xx);
ret.vec.ps[1] = __PS_MUL(half, xx);
f128 iszero = F128::CmpEq(F128::SetZero(), value);
f128 inf = F128::SetInfinity();
return F128::Select(iszero, inf, ret);
#endif

// Fast reciprocal square-root estimate:
#ifdef NLIB_F128_SIMD_NOUSE
f128 ret;
ret.vec.v[0] = (value.vec.v[0] != 0.f) ? 1.f / sqrtf(value.vec.v[0]) : INFINITY;
ret.vec.v[1] = (value.vec.v[1] != 0.f) ? 1.f / sqrtf(value.vec.v[1]) : INFINITY;
ret.vec.v[2] = (value.vec.v[2] != 0.f) ? 1.f / sqrtf(value.vec.v[2]) : INFINITY;
ret.vec.v[3] = (value.vec.v[3] != 0.f) ? 1.f / sqrtf(value.vec.v[3]) : INFINITY;
return ret;
#elif defined(NLIB_SSE41)
return _mm_rsqrt_ps(value);
#elif defined(NLIB_NEON)
return vrsqrteq_f32(value);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = __PS_RSQRTE(value.vec.ps[0]);
ret.vec.ps[1] = __PS_RSQRTE(value.vec.ps[1]);
return ret;
#endif
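
// Both the NEON and CAFE paths refine rsqrt estimates with the Newton-Raphson
// step x' = x * (3 - v * x^2) / 2; vrsqrtsq_f32(v, x * x) computes
// (3 - v * x^2) / 2 in one instruction, which is why the NEON path carries no
// explicit 3 and 0.5 constants.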
// Per-lane negate, implemented as a Permute between value and -value:
template<bool NegateLane0, bool NegateLane1, bool NegateLane2, bool NegateLane3>
// ... (signature elided)
const size_t lane0 = NegateLane0 ? 4 : 0;
const size_t lane1 = NegateLane1 ? 5 : 1;
const size_t lane2 = NegateLane2 ? 6 : 2;
const size_t lane3 = NegateLane3 ? 7 : 3;
return F128::Permute<lane0, lane1, lane2, lane3>(value, F128::Negate(value));

// Specialization when every lane is negated:
return F128::Negate(value);
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
#define NLIB_ISNAN(vec, idx) \
    ((vec.u[idx] & 0x7F800000U) == 0x7F800000U && (vec.u[idx] & 0x7FFFFFU) != 0)
#define NLIB_ISINF(vec, idx) ((vec.u[idx] & 0x7FFFFFFFU) == 0x7F800000U)
#endif

// Per-lane NaN test:
#if defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.u[0] = NLIB_ISNAN(value.vec, 0) ? 0xFFFFFFFFU : 0;
ret.vec.u[1] = NLIB_ISNAN(value.vec, 1) ? 0xFFFFFFFFU : 0;
ret.vec.u[2] = NLIB_ISNAN(value.vec, 2) ? 0xFFFFFFFFU : 0;
ret.vec.u[3] = NLIB_ISNAN(value.vec, 3) ? 0xFFFFFFFFU : 0;
return ret;
#elif defined(CAFE)
// __PS_SEL selects by sign: only NaN fails both the v >= 0 and -v >= 0 tests,
// so NaN lanes end up -1.f and every other lane 1.f.
f32x2 one = __PS_FDUP(1.f);
f32x2 minus_one = __PS_NEG(one);
f32x2 v0 = value.vec.ps[0];
f32x2 v1 = value.vec.ps[1];
f32x2 t0 = __PS_SEL(v0, one, minus_one);
f32x2 t1 = __PS_SEL(v1, one, minus_one);
f32x2 v0neg = __PS_NEG(v0);
f32x2 v1neg = __PS_NEG(v1);
f128 ret;
ret.vec.ps[0] = __PS_SEL(v0neg, one, t0);
ret.vec.ps[1] = __PS_SEL(v1neg, one, t1);
return ret;
#else
return F128::CmpNe(value, value);  // NaN is the only value not equal to itself
#endif
// F128::IsInfinite(value): per-lane +/-infinity test.
#if defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.u[0] = NLIB_ISINF(value.vec, 0) ? 0xFFFFFFFFU : 0;
ret.vec.u[1] = NLIB_ISINF(value.vec, 1) ? 0xFFFFFFFFU : 0;
ret.vec.u[2] = NLIB_ISINF(value.vec, 2) ? 0xFFFFFFFFU : 0;
ret.vec.u[3] = NLIB_ISINF(value.vec, 3) ? 0xFFFFFFFFU : 0;
return ret;
#elif defined(CAFE)
// FLT_MAX - |x| goes negative exactly when |x| exceeds every finite float:
f128 ret;
f32x2 big_value = __PS_FDUP(FLT_MAX);
ret.vec.ps[0] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[0]));
ret.vec.ps[1] = __PS_SUB(big_value, __PS_ABS(value.vec.ps[1]));
return ret;
#else
f128 inf_value = F128::SetInfinity();
f128 abs_value = F128::Abs(value);
return F128::CmpEq(inf_value, abs_value);
#endif
// F128::Round(value): round to the nearest integer-valued float.
#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
return _mm_round_ps(value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON) && __ARM_ARCH >= 8 && !defined(NLIB_F128_SIMD_NOUSE)
return vrndaq_f32(value);
#else
// Magic-number rounding: adding and subtracting 2^23 (carrying the input's
// sign) pushes the fraction bits out of the mantissa.
f128 sgn = F128::And(value, F128::SetSignMask());
// ... (sm is 2^23 with that sign or'ed in)
f128 result = F128::Sub(F128::Add(value, sm), sm);
// ...
#endif
// Truncation toward zero:
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
for (size_t i = 0; i < 4; ++i) {
    if (NLIB_ISNAN(value.vec, i)) {
        ret.vec.u[i] = 0x7FC00000U;  // canonical quiet NaN
    } else {
        ret.vec.v[i] = (fabsf(value.vec.v[i]) < 8388608.f)
                           ? static_cast<float>(static_cast<int>(value.vec.v[i]))
                           : value.vec.v[i];  // |x| >= 2^23 is already integral
    }
}
return ret;
#elif defined(NLIB_SSE41)
return _mm_round_ps(value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
f128 x = F128::Abs(value);
// ... (c_2_23 is 2^23)
f128 cond = F128::CmpLt(x, c_2_23);
f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
return F128::Select(cond, casted, value);
#else
return vrndq_f32(value);
#endif
#endif
// Floor:
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.v[0] = floorf(value.vec.v[0]);
ret.vec.v[1] = floorf(value.vec.v[1]);
ret.vec.v[2] = floorf(value.vec.v[2]);
ret.vec.v[3] = floorf(value.vec.v[3]);
return ret;
#elif defined(NLIB_SSE41)
return _mm_floor_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
f128 x = F128::Abs(value);
// ...
f128 cond = F128::CmpLt(x, c_2_23);
f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
// Where truncation rounded up (negative inputs), add the all-ones mask
// reinterpreted as int32, i.e. subtract 1:
f128 large_mask = F128::CmpGt(casted, value);
casted = F128::Add(casted, F128::ConvertFromI128(F128::CastToI128(large_mask)));
return F128::Select(cond, casted, value);
#else
return vrndmq_f32(value);
#endif
#endif

// Ceiling:
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
f128 ret;
ret.vec.v[0] = ceilf(value.vec.v[0]);
ret.vec.v[1] = ceilf(value.vec.v[1]);
ret.vec.v[2] = ceilf(value.vec.v[2]);
ret.vec.v[3] = ceilf(value.vec.v[3]);
return ret;
#elif defined(NLIB_SSE41)
return _mm_ceil_ps(value);
#elif defined(NLIB_NEON)
#if __ARM_ARCH < 8
f128 x = F128::Abs(value);
// ...
f128 cond = F128::CmpLt(x, c_2_23);
f128 casted = F128::ConvertFromI128(F128::ConvertToI128Truncate(value));
// Where truncation rounded down (positive inputs), subtract -1, i.e. add 1:
f128 small_mask = F128::CmpLt(casted, value);
casted = F128::Sub(casted, F128::ConvertFromI128(F128::CastToI128(small_mask)));
return F128::Select(cond, casted, value);
#else
return vrndpq_f32(value);
#endif
#endif
// Saturate: clamp every lane to [0, 1].
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#endif
return F128::Clamp(value, F128::SetZero(), F128::SetOne());

// F128::ModAngle(value): reduce an angle into [-pi, pi).
static const float v_1_2pi = 0.15915494309189535f;  // 1 / (2 * pi)
static const float v_2pi = 6.283185307179586f;      // 2 * pi
// ... (recp_two_pi and two_pi vectors are built from these constants)
f128 round = F128::Round(F128::Mult(value, recp_two_pi));
return F128::MultSub(two_pi, round, value);  // value - 2*pi * round(value / (2*pi))
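
// Example of the reduction: for value = 7.0 (just past 2*pi), value / (2*pi)
// rounds to 1, so the result is 7.0 - 6.2831853 = 0.7168147, the same angle
// expressed near zero.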
// Sine (name elided in this excerpt): range reduction plus a polynomial in x^2.
f128 x = F128::ModAngle(value);
// ...
f128 sin_cvalue = F128::LoadA16(F128::sin_cvalue_);
// ... (pi and pidiv2 come from the constant table)
// Mirror x into [-pi/2, pi/2], using sin(x) = sin(pi - x):
f128 xabs = F128::Abs(value);
f128 xsign = F128::And(F128::SetSignMask(), x);
f128 mypi = F128::Or(xsign, pi);  // pi carrying the sign of x
f128 pi_x = F128::Sub(mypi, x);
f128 cond = F128::CmpLe(xabs, pidiv2);
x = F128::Select(cond, x, pi_x);
// Horner evaluation of the minimax polynomial:
f128 xx = F128::Mult(x, x);
f128 coeff = F128::LoadA16(sin_coeff_);
// ... (result is seeded from the leading coefficients)
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<2>(sin_cvalue, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(sin_cvalue, each_select32));
result = F128::Mult(xx, result);
result = F128::MultSub(result, x, x);  // x - x * (x^2 * poly)
// ...
// Cosine: same reduction, with a sign fix-up for mirrored lanes
// (cos(x) = -cos(pi - x)).
f128 x = F128::ModAngle(value);
// ...
f128 cvalue = F128::LoadA16(cos_cvalue_);
// ...
f128 xabs = F128::Abs(value);
f128 xsign = F128::And(F128::SetSignMask(), x);
// ... (mypi = pi with the sign of x; cond = |x| <= pi/2)
f128 pi_x = F128::Sub(mypi, x);
// ...
x = F128::Select(cond, x, pi_x);
// ...
f128 sign = F128::AndNot(cond, F128::SetSignMask());  // negate mirrored lanes
// ...
f128 xx = F128::Mult(x, x);
f128 coeff = F128::LoadA16(cos_coeff_);
// ...
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
result = F128::MultSub(xx, result, F128::SetOne());
result = F128::Xor(sign, result);
// ...
// Combined sine and cosine, sharing one range reduction; the cosine lands in
// ret.val[1] and the sine in ret.val[0]:
const f128 signmask = F128::SetSignMask();
f128 x = F128::ModAngle(value);
// ...
f128 cvalue = F128::LoadA16(cos_cvalue_);
// ...
f128 xabs = F128::Abs(value);
f128 xsign = F128::And(signmask, x);
// ...
f128 pi_x = F128::Sub(mypi, x);
// ...
x = F128::Select(cond, x, pi_x);
// ...
f128 sign = F128::AndNot(cond, signmask);
// ...
f128 xx = F128::Mult(x, x);
// Cosine polynomial:
f128 coeff = F128::LoadA16(cos_coeff_);
// ...
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<2>(cvalue, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(cvalue, each_select32));
result = F128::MultSub(xx, result, F128::SetOne());
ret.val[1] = F128::Xor(sign, result);
// Sine polynomial:
f128 coeff = F128::LoadA16(sin_coeff_);
// ...
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff, each_select32));
result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[2], each_float));
result = F128::MultSub(xx, result, F128::SetValue(sin_cvalue_[3], each_float));
result = F128::Mult(xx, result);
ret.val[0] = F128::MultSub(result, x, x);
// ...
// F128::ArcTan(value): for |value| > 1 use arctan(v) = +/-pi/2 - arctan(1/v).
f128 cmp, value_sign;
// ...
f128 one = F128::SetOne();
// ...
value_sign = F128::AndNot(F128::CmpGt(value, one), F128::SetSignMask());
cmp = F128::CmpLe(F128::Abs(value), one);
// ...
f128 x = F128::Select(cmp, value, F128::Recp(value));
// ...
f128 coeff0 = F128::LoadA16(&atan_coeff_[0]);
f128 coeff1 = F128::LoadA16(&atan_coeff_[4]);
f128 xx = F128::Mult(x, x);
// ... (result is seeded from the leading coefficients)
result = F128::MultSub(xx, result, F128::SetValue<1>(coeff1, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<0>(coeff1, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<3>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<2>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<1>(coeff0, each_select32));
result = F128::MultSub(xx, result, F128::SetValue<0>(coeff0, each_select32));
// ...
result = F128::Mult(result, x);
result = F128::MultSub(xx, result, x);
// ... (pi_2 = pi/2 from the constant table)
f128 result_another = F128::Sub(F128::Xor(value_sign, pi_2), result);
result = F128::Select(cmp, result, result_another);
// ...
// F128::ArcTan2(y, x): quadrant-aware arctan(y / x) with the usual special
// cases for zeros and infinities.
const f128 signmask = F128::SetSignMask();
// ...
const f128 sy = F128::And(y, signmask);
const f128 infx = F128::IsInfinite(x);
const f128 infy = F128::IsInfinite(y);
const f128 zerox = F128::CmpEqZero(x);
const f128 zeroy = F128::CmpEqZero(y);
const f128 posx = F128::CmpGtZero(x);
// ...
const f128 cval = F128::LoadA16(atan2_cvalue_);
// ... (pi, pi_2, pi_4, pi_34 come from cval, signed with sy)
// Resolve the special cases first; ordinary lanes come out all-ones and fall
// through to the generic arctan below:
f128 v = F128::Select(infy, F128::Select(infx, F128::Select(posx, pi_4, pi_34), pi_2),
                      F128::Select(zeroy, F128::AndNot(posx, pi), F128::OrNot(zerox, pi_2)));
// ...
f128 mask;
#if defined(NLIB_F128_SIMD_NOUSE)
mask.vec.u[0] = v.vec.u[0] == 0xFFFFFFFFU ? v.vec.u[0] : 0;
mask.vec.u[1] = v.vec.u[1] == 0xFFFFFFFFU ? v.vec.u[1] : 0;
mask.vec.u[2] = v.vec.u[2] == 0xFFFFFFFFU ? v.vec.u[2] : 0;
mask.vec.u[3] = v.vec.u[3] == 0xFFFFFFFFU ? v.vec.u[3] : 0;
#elif defined(CAFE)
// CAFE masks are sign-encoded, so ordinary lanes become -1.f:
mask.vec.ps[0][0] = v.vec.u[0] == 0xFF7FFFFFUL ? -1.f : 1.f;
mask.vec.ps[0][1] = v.vec.u[1] == 0xFF7FFFFFUL ? -1.f : 1.f;
mask.vec.ps[1][0] = v.vec.u[2] == 0xFF7FFFFFUL ? -1.f : 1.f;
mask.vec.ps[1][1] = v.vec.u[3] == 0xFF7FFFFFUL ? -1.f : 1.f;
#else
mask =
    F128::CastFromI128(I128::CmpEq32(F128::CastToI128(v), I128::SetValue(-1, each_int8)));
#endif
f128 result = F128::Add(F128::ArcTan(F128::Div(y, x)), F128::AndNot(posx, pi));
return F128::Select(mask, result, v);
// Arcsine via arctan2(value, sqrt(1 - value^2)):
f128 one = F128::SetOne();
f128 tmp = F128::MultSub(value, value, one);                      // 1 - value^2
f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));  // clamp negatives to 0
return F128::ArcTan2(value, argx);

// Arccosine via arctan2(sqrt(1 - value^2), value):
f128 one = F128::SetOne();
f128 tmp = F128::MultSub(value, value, one);
f128 argx = F128::Sqrt(F128::AndNot(F128::CmpLtZero(tmp), tmp));
return F128::ArcTan2(argx, value);
// Pack the four lane masks into a 4-bit integer (movemask-style helper):
#ifdef NLIB_F128_SIMD_NOUSE
int ret = 0;
ret |= value.vec.u[0] == 0xFFFFFFFFU ? 1 : 0;
ret |= value.vec.u[1] == 0xFFFFFFFFU ? 2 : 0;
ret |= value.vec.u[2] == 0xFFFFFFFFU ? 4 : 0;
ret |= value.vec.u[3] == 0xFFFFFFFFU ? 8 : 0;
return ret;
#elif defined(NLIB_SSE41)
return static_cast<uint8_t>(_mm_movemask_ps(value));
#elif defined(NLIB_NEON)
// Keep bit K of lane K, then sum the lanes:
uint32x2_t powers_lo = vcreate_u32(0x0000000200000001ULL);  // {1, 2}
uint32x2_t powers_hi = vshl_n_u32(powers_lo, 2);            // {4, 8}
uint32x4_t powers = vcombine_u32(powers_lo, powers_hi);
uint32x4_t a = vandq_u32(vreinterpretq_u32_f32(value), powers);
#ifdef __aarch64__
return vaddvq_u32(a);
#else
uint16x4_t tmp = vmovn_u32(a);
tmp = vpadd_u16(tmp, tmp);
tmp = vpadd_u16(tmp, tmp);
return vget_lane_u8(vreinterpret_u8_u16(tmp), 0);
#endif
#else  // CAFE: extract the sign bits directly
int tmp = (value.vec.u[0] >> 31);
tmp |= (value.vec.u[1] >> 30) & 2;
tmp |= (value.vec.u[2] >> 29) & 4;
tmp |= (value.vec.u[3] >> 28) & 8;
return tmp;
#endif
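
// Sketch (illustrative only; GetBoolMask stands in for the packing helper
// above, whose public name sits outside this excerpt):
//   f128 m = F128::CmpGt(a, b);
//   int bits = F128::GetBoolMask(m);  // bit K set <=> lane K had a > b
//   if (bits == 0xF) { /* every lane passed */ }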
// True when every lane mask is zero:
#ifdef NLIB_F128_SIMD_NOUSE
return value.vec.u[0] == 0 && value.vec.u[1] == 0 && value.vec.u[2] == 0 && value.vec.u[3] == 0;
#elif defined(NLIB_SSE41)
i128 casted = F128::CastToI128(value);
return _mm_testz_si128(casted, casted) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
uint32x4_t mask = vceqzq_u32(vreinterpretq_u32_f32(value));
return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;  // all four lanes equal to zero
#else
int32x4_t casted = vreinterpretq_s32_f32(value);
int32x2_t tmp = vorr_s32(vget_low_s32(casted), vget_high_s32(casted));
return vget_lane_u64(vreinterpret_u64_s32(tmp), 0) == 0;
#endif
#else  // CAFE: masks are sign-encoded, so OR the sign bits
uint32_t tmp = value.vec.u[0] | value.vec.u[1] | value.vec.u[2] | value.vec.u[3];
return (tmp & 0x80000000U) == 0;
#endif

// True when every lane mask is all-ones:
#ifdef NLIB_F128_SIMD_NOUSE
return value.vec.u[0] == 0xFFFFFFFFU && value.vec.u[1] == 0xFFFFFFFFU &&
       value.vec.u[2] == 0xFFFFFFFFU && value.vec.u[3] == 0xFFFFFFFFU;
#elif defined(NLIB_SSE41)
i128 casted = F128::CastToI128(value);
return _mm_testc_si128(casted, _mm_cmpeq_epi8(casted, casted)) != 0;
#elif defined(NLIB_NEON)
#ifdef __aarch64__
uint32x4_t mask = vceqzq_u32(vmvnq_u32(vreinterpretq_u32_f32(value)));
return vaddvq_s32(vreinterpretq_s32_u32(mask)) == -4;
#else
int32x4_t casted = vreinterpretq_s32_f32(value);
int32x2_t tmp = vand_s32(vget_low_s32(casted), vget_high_s32(casted));
return vget_lane_s64(vreinterpret_s64_s32(tmp), 0) == -1;
#endif
#else  // CAFE: AND the sign bits
uint32_t tmp = value.vec.u[0] & value.vec.u[1] & value.vec.u[2] & value.vec.u[3];
return (tmp & 0x80000000U) != 0;
#endif
// Extract lane N as a float:
#ifdef NLIB_F128_SIMD_NOUSE
return value.vec.v[N];
#elif defined(NLIB_SSE41)
float dest;
_MM_EXTRACT_FLOAT(dest, value, N);
return dest;
#elif defined(NLIB_NEON)
return vgetq_lane_f32(value, N);
#else  // CAFE
return value.vec.ps[N / 2][N % 2];
#endif

// Extract lane N as its 32-bit pattern:
#ifdef NLIB_F128_SIMD_NOUSE
return value.vec.u[N];
#elif defined(NLIB_SSE41)
return _mm_extract_ps(value, N);
#elif defined(NLIB_NEON)
uint32x4_t tmp = vreinterpretq_u32_f32(value);
return vgetq_lane_u32(tmp, N);
#else  // CAFE
return value.vec.u[N];
#endif
// Runtime-index variants dispatch on idx:
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
return value.vec.v[idx];
#elif defined(NLIB_SSE41)
float dest;
switch (idx) {
  case 0:
    _MM_EXTRACT_FLOAT(dest, value, 0);
    break;
  case 1:
    _MM_EXTRACT_FLOAT(dest, value, 1);
    break;
  case 2:
    _MM_EXTRACT_FLOAT(dest, value, 2);
    break;
  case 3:
    _MM_EXTRACT_FLOAT(dest, value, 3);
    break;
  // ...
}
return dest;
#elif defined(NLIB_NEON)
switch (idx) {
  case 0:
    return vgetq_lane_f32(value, 0);
  case 1:
    return vgetq_lane_f32(value, 1);
  case 2:
    return vgetq_lane_f32(value, 2);
  case 3:
    return vgetq_lane_f32(value, 3);
  // ...
}
#endif

// Same dispatch for the uint32 view:
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
return value.vec.u[idx];
#elif defined(NLIB_SSE41)
switch (idx) {
  case 0:
    return static_cast<uint32_t>(_mm_extract_ps(value, 0));
  case 1:
    return static_cast<uint32_t>(_mm_extract_ps(value, 1));
  case 2:
    return static_cast<uint32_t>(_mm_extract_ps(value, 2));
  case 3:
    return static_cast<uint32_t>(_mm_extract_ps(value, 3));
  // ...
}
#elif defined(NLIB_NEON)
uint32x4_t tmp = vreinterpretq_u32_f32(value);
switch (idx) {
  case 0:
    return vgetq_lane_u32(tmp, 0);
  case 1:
    return vgetq_lane_u32(tmp, 1);
  case 2:
    return vgetq_lane_u32(tmp, 2);
  case 3:
    return vgetq_lane_u32(tmp, 3);
  // ...
}
#endif
// F128::SetFloatToLane<N>(value, v): replace lane N with v.
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
f128 tmp = _mm_set_ss(v);
return _mm_insert_ps(value, tmp, N << 4);
#elif defined(NLIB_NEON)
// For a compile-time constant v, a permute against a broadcast beats a lane move:
return __builtin_constant_p(v)
           ? F128::Permute<N == 0 ? 4 : 0, N == 1 ? 5 : 1, N == 2 ? 6 : 2,
                           N == 3 ? 7 : 3>(value, vdupq_n_f32(v))
           : vsetq_lane_f32(v, value, N);
#else  // CAFE
f128 ret = value;
ret.vec.ps[N / 2][N % 2] = v;
return ret;
#endif

// Runtime-index variant:
#ifdef NLIB_F128_SIMD_NOUSE
// ...
#elif defined(NLIB_SSE41)
f128 tmp = _mm_set_ss(v);
switch (idx) {
  case 0:
    return _mm_insert_ps(value, tmp, 0x00);
  case 1:
    return _mm_insert_ps(value, tmp, 0x10);
  case 2:
    return _mm_insert_ps(value, tmp, 0x20);
  case 3:
    return _mm_insert_ps(value, tmp, 0x30);
  // ...
}
#elif defined(NLIB_NEON)
switch (idx) {
  case 0:
    return F128::SetFloatToLane<0>(value, v);
  case 1:
    return F128::SetFloatToLane<1>(value, v);
  case 2:
    return F128::SetFloatToLane<2>(value, v);
  case 3:
    return F128::SetFloatToLane<3>(value, v);
  // ...
}
#else  // CAFE
f128 ret = value;
switch (idx) {
  case 0:
    ret.vec.ps[0][0] = v;
    break;
  case 1:
    ret.vec.ps[0][1] = v;
    break;
  case 2:
    ret.vec.ps[1][0] = v;
    break;
  case 3:
    ret.vec.ps[1][1] = v;
    break;
  // ...
}
return ret;
#endif
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Helpers (in namespace detail) for the NEON Swizzle/Permute implementations.
// F64Merge<IsHighA, IsHighB>(a, b) builds {a[IsHighA], b[IsHighB]}:
template<bool IsHighA, bool IsHighB>
float32x2_t F64Merge(float32x2_t a, float32x2_t b) NLIB_NOEXCEPT;

// <false, false> -> {a0, b0}:
#ifdef __aarch64__
return vtrn1_f32(a, b);
#else
return vtrn_f32(a, b).val[0];
#endif

// <true, false> -> {a1, b0}:
#ifdef __aarch64__
return vtrn1_f32(vrev64_f32(a), b);
#else
return vtrn_f32(vrev64_f32(a), b).val[0];
#endif

// <false, true> -> {a0, b1}:
#ifdef __aarch64__
return vtrn1_f32(a, vrev64_f32(b));
#else
return vtrn_f32(a, vrev64_f32(b)).val[0];
#endif

// <true, true> -> {a1, b1}:
#ifdef __aarch64__
return vtrn2_f32(a, b);
#else
return vtrn_f32(a, b).val[1];
#endif

// F128SwizzleGet64<N>(value): the low half for N == 0, the high half for N == 1:
return vget_low_f32(value);
// ...
return vget_high_f32(value);
// Build one 64-bit half of a swizzle result. Method signatures below are
// reconstructed from the call sites; the original qualifiers are elided.
template<int X0, int X1>
struct F128SwizzleHelper2 {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t x0 = F128SwizzleGet64<X0 / 2>(value);
    float32x2_t x1 = F128SwizzleGet64<X1 / 2>(value);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};

template<int X>
struct F128SwizzleHelper2<X, X> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t x = F128SwizzleGet64<X / 2>(value);
    return vdup_lane_f32(x, (X & 1));
  }
};

template<>
struct F128SwizzleHelper2<0, 1> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    return vget_low_f32(value);
  }
};

template<>
struct F128SwizzleHelper2<0, 2> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp1q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

template<>
struct F128SwizzleHelper2<0, 3> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[0];
#endif
  }
};

template<>
struct F128SwizzleHelper2<1, 0> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_low_f32(value));
  }
};

template<>
struct F128SwizzleHelper2<1, 2> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip2_f32(lo, hi);
#else
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

template<>
struct F128SwizzleHelper2<1, 3> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_low_f32(vuzp2q_f32(value, value));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(lo, hi).val[1];
#endif
  }
};

template<>
struct F128SwizzleHelper2<2, 0> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 0));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

template<>
struct F128SwizzleHelper2<2, 1> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
#ifdef __aarch64__
    return vget_high_f32(vcopyq_laneq_f32(value, 3, value, 1));
#else
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

template<>
struct F128SwizzleHelper2<2, 3> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    return vget_high_f32(value);
  }
};

template<>
struct F128SwizzleHelper2<3, 0> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vrev64_f32(vget_high_f32(value));
#ifdef __aarch64__
    return vzip1_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[0];
#endif
  }
};

template<>
struct F128SwizzleHelper2<3, 1> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t lo = vget_low_f32(value);
    float32x2_t hi = vget_high_f32(value);
#ifdef __aarch64__
    return vzip2_f32(hi, lo);
#else
    return vzip_f32(hi, lo).val[1];
#endif
  }
};

template<>
struct F128SwizzleHelper2<3, 2> {
  static float32x2_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    return vrev64_f32(vget_high_f32(value));
  }
};

template<int V0, int V1, int V2, int V3>
struct F128SwizzleHelper {
  static float32x4_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    return vcombine_f32(detail::F128SwizzleHelper2<V0, V1>::Swizzle(value),
                        detail::F128SwizzleHelper2<V2, V3>::Swizzle(value));
  }
};

template<int Vx, int Vy>
struct F128SwizzleHelper<Vx, Vy, Vx, Vy> {
  static float32x4_t Swizzle(float32x4_t value) NLIB_NOEXCEPT {
    float32x2_t tmp = detail::F128SwizzleHelper2<Vx, Vy>::Swizzle(value);
    return vcombine_f32(tmp, tmp);
  }
};

template<int V>
struct F128SwizzleHelper<V, V, V, V> {
  // ... (broadcast of lane V)
};
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// Paired-single swizzle helpers: each specialization builds one f32x2 half
// {vX0, vX1} from the two input halves with a single merge.
template<int X0, int X1>
struct F128SwizzleHelper {
  // ... (generic case)
};

template<>
struct F128SwizzleHelper<0, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v0); }
};

template<>
struct F128SwizzleHelper<0, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v0; }  // identity (body elided in source)
};

template<>
struct F128SwizzleHelper<0, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v0, v1); }
};

template<>
struct F128SwizzleHelper<0, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v0, v1); }
};

template<>
struct F128SwizzleHelper<1, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v0); }
};

template<>
struct F128SwizzleHelper<1, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v0); }
};

template<>
struct F128SwizzleHelper<1, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v0, v1); }
};

template<>
struct F128SwizzleHelper<1, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v0, v1); }
};

template<>
struct F128SwizzleHelper<2, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v0); }
};

template<>
struct F128SwizzleHelper<2, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE01(v1, v0); }
};

template<>
struct F128SwizzleHelper<2, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE00(v1, v1); }
};

template<>
struct F128SwizzleHelper<2, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return v1; }  // identity (body elided in source)
};

template<>
struct F128SwizzleHelper<3, 0> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v0); }
};

template<>
struct F128SwizzleHelper<3, 1> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v0); }
};

template<>
struct F128SwizzleHelper<3, 2> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE10(v1, v1); }
};

template<>
struct F128SwizzleHelper<3, 3> {
  static f32x2 Swizzle(f32x2 v0, f32x2 v1) NLIB_NOEXCEPT { return __PS_MERGE11(v1, v1); }
};
// F128::Swizzle<V0, V1, V2, V3>(value): arbitrary lane shuffle; passing -1
// keeps that lane unchanged.
template<int V0, int V1, int V2, int V3>
// ... (signature elided)
#if defined(NLIB_F128_SIMD_NOUSE)
f128 ret;
ret.vec.v[0] = value.vec.v[V0 != -1 ? V0 : 0];
ret.vec.v[1] = value.vec.v[V1 != -1 ? V1 : 1];
ret.vec.v[2] = value.vec.v[V2 != -1 ? V2 : 2];
ret.vec.v[3] = value.vec.v[V3 != -1 ? V3 : 3];
return ret;
#elif __has_builtin(__builtin_shufflevector)
return __builtin_shufflevector(value, value, V0, V1, V2, V3);
#elif defined(NLIB_SSE41)
return _mm_shuffle_ps(
    value, value,
    _MM_SHUFFLE(V3 != -1 ? V3 : 3, V2 != -1 ? V2 : 2, V1 != -1 ? V1 : 1, V0 != -1 ? V0 : 0));
#elif defined(NLIB_NEON)
return detail::F128SwizzleHelper<V0 != -1 ? V0 : 0, V1 != -1 ? V1 : 1, V2 != -1 ? V2 : 2,
                                 V3 != -1 ? V3 : 3>::Swizzle(value);
#else  // CAFE
f128 ret;
ret.vec.ps[0] = detail::F128SwizzleHelper<(V0 != -1 ? V0 : 0), (V1 != -1 ? V1 : 1)>::Swizzle(
    value.vec.ps[0], value.vec.ps[1]);
ret.vec.ps[1] = detail::F128SwizzleHelper<(V2 != -1 ? V2 : 2), (V3 != -1 ? V3 : 3)>::Swizzle(
    value.vec.ps[0], value.vec.ps[1]);
return ret;
#endif
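
// Swizzle sketch (illustrative only):
//   f128 r = F128::Swizzle<3, 2, 1, 0>(v);  // reverse: {v3, v2, v1, v0}
//   f128 s = F128::Swizzle<0, 0, 1, 1>(v);  // {v0, v0, v1, v1}
// Passing -1 for an index keeps that lane, as the fallbacks above show.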
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Full-vector Swizzle specializations that map to single NEON instructions.
// {v0, v0, v1, v1}:
#ifdef __aarch64__
return vzip1q_f32(value, value);
#else
return vzipq_f32(value, value).val[0];
#endif

// {v0, v0, v2, v2}:
#ifdef __aarch64__
return vtrn1q_f32(value, value);
#else
return vtrnq_f32(value, value).val[0];
#endif

// {v0, v2, v0, v2}:
#ifdef __aarch64__
return vuzp1q_f32(value, value);
#else
return vuzpq_f32(value, value).val[0];
#endif

// {v1, v0, v3, v2}:
return vrev64q_f32(value);

// {v1, v1, v3, v3}:
#ifdef __aarch64__
return vtrn2q_f32(value, value);
#else
return vtrnq_f32(value, value).val[1];
#endif

// {v1, v2, v3, v0}: rotate left by one lane.
uint32x4_t ival = vreinterpretq_u32_f32(value);
uint32x4_t rotated = vextq_u32(ival, ival, 1);
return vreinterpretq_f32_u32(rotated);

// {v1, v3, v1, v3}:
#ifdef __aarch64__
return vuzp2q_f32(value, value);
#else
return vuzpq_f32(value, value).val[1];
#endif

// {v2, v2, v3, v3}:
#ifdef __aarch64__
return vzip2q_f32(value, value);
#else
return vzipq_f32(value, value).val[1];
#endif

// {v2, v3, v0, v1}: rotate by two lanes.
uint32x4_t ival = vreinterpretq_u32_f32(value);
uint32x4_t rotated = vextq_u32(ival, ival, 2);
return vreinterpretq_f32_u32(rotated);

// {v3, v0, v1, v2}: rotate by three lanes.
uint32x4_t ival = vreinterpretq_u32_f32(value);
uint32x4_t rotated = vextq_u32(ival, ival, 3);
return vreinterpretq_f32_u32(rotated);
namespace detail {  // reopened (reconstructed)

#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
template <bool UseBlend, bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2 {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Generic path: swizzle both sources, then blend lane by lane.
    f128 as = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(a);
    f128 bs = F128::Swizzle<V0 & 3, V1 & 3, V2 & 3, V3 & 3>(b);
    return _mm_blend_ps(
        as, bs,
        (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) | ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

template <bool UseShuffle, int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<true, UseShuffle, V0, V1, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Every index already sits in its own lane position: one blend suffices.
    return _mm_blend_ps(
        a, b,
        (((V0 & 4) ? 1 : 0) | ((V1 & 4) ? 2 : 0) | ((V2 & 4) ? 4 : 0) | ((V3 & 4) ? 8 : 0)));
  }
};

template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper2<false, true, V0, V1, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Two lanes from each source half: one shuffle suffices.
    return _mm_shuffle_ps(V0 < 4 ? a : b, V0 < 4 ? b : a,
                          _MM_SHUFFLE((V3 & 3), (V2 & 3), (V1 & 3), (V0 & 3)));
  }
};

template <>
struct F128PermuteHelper2<false, false, 1, 2, 3, 4> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 4);
    return _mm_castsi128_ps(tmp);
  }
};
template <>
struct F128PermuteHelper2<false, false, 3, 4, 5, 6> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), 12);
    return _mm_castsi128_ps(tmp);
  }
};
template <>
struct F128PermuteHelper2<false, false, 5, 6, 7, 0> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Corrected from the garbled excerpt: (b1, b2, b3, a0) is bytes 4..19 of
    // the concatenation a:b, i.e. alignr(a, b, 4).
    __m128i tmp = _mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), 4);
    return _mm_castsi128_ps(tmp);
  }
};

// One lane replaced via insert_ps (specialization headers reconstructed).
template <int V>
struct F128PermuteHelper2<false, false, V, 1, 2, 3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (0 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, V, 2, 3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (1 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, V, 3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (2 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 0, 1, 2, V> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(a, b, ((V - 4) << 6) | (3 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, V, 5, 6, 7> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (0 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, V, 6, 7> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (1 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, 5, V, 7> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (2 << 4));
  }
};
template <int V>
struct F128PermuteHelper2<false, false, 4, 5, 6, V> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_insert_ps(b, a, (V << 6) | (3 << 4));
  }
};

template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128PermuteHelper2<
        ((V0 % 4 == 0) && (V1 % 4 == 1) && (V2 % 4 == 2) && (V3 % 4 == 3)),
        ((V0 < 4 && V1 < 4 && V2 >= 4 && V3 >= 4) || (V0 >= 4 && V1 >= 4 && V2 < 4 && V3 < 4)),
        V0, V1, V2, V3>::Permute(a, b);
  }
};
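// Editorial note: the helper chain picks the cheapest SSE4.1 sequence from
// the index pattern. For example, F128::Permute<0, 5, 2, 7>(a, b) keeps
// every index in its own lane position (V % 4 == lane), so it compiles to a
// single _mm_blend_ps(a, b, 0b1010); F128::Permute<0, 1, 4, 5>(a, b) draws
// two lanes from each source half, so it becomes one _mm_shuffle_ps.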
#elif defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
// Fetches a 64-bit half (0: low(a), 1: high(a), 2: low(b), 3: high(b)) for
// the pairwise permute below (function headers reconstructed).
template <int N>
float32x2_t F128PermuteGet64(f128arg a, f128arg b) NLIB_NOEXCEPT;
template <>
inline float32x2_t F128PermuteGet64<0>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_low_f32(a);
}
template <>
inline float32x2_t F128PermuteGet64<1>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_high_f32(a);
}
template <>
inline float32x2_t F128PermuteGet64<2>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_low_f32(b);
}
template <>
inline float32x2_t F128PermuteGet64<3>(f128arg a, f128arg b) NLIB_NOEXCEPT {
  return vget_high_f32(b);
}

template <int X0, int X1>
struct F128PermuteHelper2 {
  static float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x0 = F128PermuteGet64<X0 / 2>(a, b);
    float32x2_t x1 = F128PermuteGet64<X1 / 2>(a, b);
    return F64Merge<(X0 & 1), (X1 & 1)>(x0, x1);
  }
};
template <int X>
struct F128PermuteHelper2<X, X> {
  static float32x2_t Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    float32x2_t x = F128PermuteGet64<X / 2>(a, b);
    return vdup_lane_f32(x, (X & 1));
  }
};

template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return vcombine_f32(F128PermuteHelper2<V0, V1>::Permute(a, b),
                        F128PermuteHelper2<V2, V3>::Permute(a, b));
  }
};
template <>
struct F128PermuteHelper<false, false, 1, 2, 3, 4> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};
template <>
struct F128PermuteHelper<false, false, 3, 4, 5, 6> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b), 3);
    return vreinterpretq_f32_s32(tmp);
  }
};
template <>
struct F128PermuteHelper<false, false, 5, 6, 7, 0> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    int32x4_t tmp = vextq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a), 1);
    return vreinterpretq_f32_s32(tmp);
  }
};
#elif defined(CAFE) && !defined(NLIB_F128_SIMD_NOUSE)
// R0/R1 select the lane within each 64-bit half; VAR0/VAR1 select which of
// the four halves (v0 = a.lo, v1 = a.hi, v2 = b.lo, v3 = b.hi) feeds each
// result lane (signatures reconstructed).
template <int R0, int R1, int VAR0, int VAR1>
struct F128PermuteHelper2 {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT;
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v0);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v1);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v2);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 0, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v0, v3);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v0);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v1);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v2);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 1, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v1, v3);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v0);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v1);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v2);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 2, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v2, v3);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 0> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v0);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 1> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v1);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 2> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v2);
  }
};
template <int R0, int R1>
struct F128PermuteHelper2<R0, R1, 3, 3> {
  static f32x2 Permute(f32x2 v0, f32x2 v1, f32x2 v2, f32x2 v3) NLIB_NOEXCEPT {
    return detail::F128SwizzleHelper<R0, (2 + R1)>::Swizzle(v3, v3);
  }
};

template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    f128 ret;
    f32x2 x0 = a.vec.ps[0];
    f32x2 x1 = a.vec.ps[1];
    f32x2 x2 = b.vec.ps[0];
    f32x2 x3 = b.vec.ps[1];
    ret.vec.ps[0] =
        F128PermuteHelper2<(V0 & 1), (V1 & 1), (V0 / 2), (V1 / 2)>::Permute(x0, x1, x2, x3);
    ret.vec.ps[1] =
        F128PermuteHelper2<(V2 & 1), (V3 & 1), (V2 / 2), (V3 / 2)>::Permute(x0, x1, x2, x3);
    return ret;
  }
};
#else
template <bool IsAllA, bool IsAllB, int V0, int V1, int V2, int V3>
struct F128PermuteHelper {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Scalar fallback: gather each lane from whichever source it names.
    f128 ret = F128::SetValue(F128::GetFloatFromLane<V0 & 3>(V0 < 4 ? a : b),
                              F128::GetFloatFromLane<V1 & 3>(V1 < 4 ? a : b),
                              F128::GetFloatFromLane<V2 & 3>(V2 < 4 ? a : b),
                              F128::GetFloatFromLane<V3 & 3>(V3 < 4 ? a : b));
    return ret;
  }
};
#endif

// All indices from a, or all from b: the permute degenerates to a swizzle.
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<true, false, V0, V1, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128::Swizzle<V0, V1, V2, V3>(a);
  }
};
template <int V0, int V1, int V2, int V3>
struct F128PermuteHelper<false, true, V0, V1, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return F128::Swizzle<(V0 - 4), (V1 - 4), (V2 - 4), (V3 - 4)>(b);
  }
};

#if defined(NLIB_SSE41) && !defined(NLIB_F128_SIMD_NOUSE)
template <>
struct F128PermuteHelper<false, false, 0, 4, 1, 5> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(a, b);
  }
};
template <>
struct F128PermuteHelper<false, false, 4, 0, 5, 1> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpacklo_ps(b, a);
  }
};
template <>
struct F128PermuteHelper<false, false, 2, 6, 3, 7> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(a, b);
  }
};
template <>
struct F128PermuteHelper<false, false, 6, 2, 7, 3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return _mm_unpackhi_ps(b, a);
  }
};
#endif
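// Editorial note: these four patterns are the SSE interleave forms, e.g.
//   F128::Permute<0, 4, 1, 5>(a, b) == (a0, b0, a1, b1) == _mm_unpacklo_ps(a, b)
//   F128::Permute<2, 6, 3, 7>(a, b) == (a2, b2, a3, b3) == _mm_unpackhi_ps(a, b)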
// Resolves "don't care" lanes (index 8) to concrete indices before handing
// off to F128PermuteHelper; each specialization picks replacements that keep
// 64-bit lane pairs together (signatures reconstructed).
template <int V0, int V1, int V2, int V3>
struct F128PermuteDontCareHelper {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const bool arg1 = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4);
    static const bool arg2 = (V0 > 3 && V1 > 3 && V2 > 3 && V3 > 3);
    return detail::F128PermuteHelper<arg1, arg2, V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V1, int V2, int V3>
struct F128PermuteDontCareHelper<8, V1, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    // Pair the don't-care lane with its neighbor's 64-bit partner.
    static const int V0 = (V1 & 1) ? V1 - 1 : V1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V0, int V2, int V3>
struct F128PermuteDontCareHelper<V0, 8, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V1 = (V0 & 1) ? V0 : (V0 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V0, int V1, int V3>
struct F128PermuteDontCareHelper<V0, V1, 8, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V0, int V1, int V2>
struct F128PermuteDontCareHelper<V0, V1, V2, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V3 = (V2 & 1) ? V2 : (V2 + 1);
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V2, int V3>
struct F128PermuteDontCareHelper<8, 8, V2, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = (V2 < 4) ? 0 : 4;
    return F128PermuteDontCareHelper<V0, V0 + 1, V2, V3>::Permute(a, b);
  }
};
template <int V1, int V2>
struct F128PermuteDontCareHelper<8, V1, V2, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = (V1 & 1) ? V1 - 1 : V1;
    static const int V3 = (V2 & 1) ? V2 : V2 + 1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V0, int V1>
struct F128PermuteDontCareHelper<V0, V1, 8, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V2 = (V1 < 4) ? 2 : 6;
    return F128PermuteDontCareHelper<V0, V1, V2, V2 + 1>::Permute(a, b);
  }
};
template <int V0, int V3>
struct F128PermuteDontCareHelper<V0, 8, 8, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V1 = (V0 & 1) ? V0 : V0 + 1;
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V0, int V2>
struct F128PermuteDontCareHelper<V0, 8, V2, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V1 = (V0 & 1) ? V0 : V0 + 1;
    static const int V3 = (V2 & 1) ? V2 : V2 + 1;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V1, int V3>
struct F128PermuteDontCareHelper<8, V1, 8, V3> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = (V1 & 1) ? V1 - 1 : V1;
    static const int V2 = (V3 & 1) ? V3 - 1 : V3;
    return F128PermuteDontCareHelper<V0, V1, V2, V3>::Permute(a, b);
  }
};
template <int V>
struct F128PermuteDontCareHelper<V, 8, 8, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V1 = ((V & 3) == 0) ? V + 1 : V;
    static const int V2 = ((V & 3) == 0) ? V + 2 : V;
    static const int V3 = ((V & 3) == 0) ? V + 3 : V;
    return F128PermuteDontCareHelper<V, V1, V2, V3>::Permute(a, b);
  }
};
template <int V>
struct F128PermuteDontCareHelper<8, V, 8, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = ((V & 3) == 1) ? V - 1 : V;
    static const int V2 = ((V & 3) == 1) ? V + 1 : V;
    static const int V3 = ((V & 3) == 1) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V, V2, V3>::Permute(a, b);
  }
};
template <int V>
struct F128PermuteDontCareHelper<8, 8, V, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = ((V & 3) == 2) ? V - 2 : V;
    static const int V1 = ((V & 3) == 2) ? V - 1 : V;
    static const int V3 = ((V & 3) == 2) ? V + 2 : V;
    return F128PermuteDontCareHelper<V0, V1, V, V3>::Permute(a, b);
  }
};
template <int V>
struct F128PermuteDontCareHelper<8, 8, 8, V> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    static const int V0 = ((V & 3) == 3) ? V - 3 : V;
    static const int V1 = ((V & 3) == 3) ? V - 2 : V;
    static const int V2 = ((V & 3) == 3) ? V - 1 : V;
    return F128PermuteDontCareHelper<V0, V1, V2, V>::Permute(a, b);
  }
};
template <>
struct F128PermuteDontCareHelper<8, 8, 8, 8> {
  static f128 Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
    return a;  // every lane is don't-care (body reconstructed)
  }
};

}  // namespace detail (closer reconstructed)
template <int V0, int V1, int V2, int V3>
NLIB_M(f128) F128::Permute(f128arg a, f128arg b) NLIB_NOEXCEPT {
#if __has_builtin(__builtin_shufflevector) && !defined(NLIB_F128_SIMD_NOUSE)
  return __builtin_shufflevector(a, b, (V0 != 8 ? V0 : -1), (V1 != 8 ? V1 : -1),
                                 (V2 != 8 ? V2 : -1), (V3 != 8 ? V3 : -1));
#else
  return detail::F128PermuteDontCareHelper<
      (V0 != -1 ? V0 : 8), (V1 != -1 ? V1 : 8),
      (V2 != -1 ? V2 : 8), (V3 != -1 ? V3 : 8)>::Permute(a, b);
#endif
}
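// Usage sketch (editorial example): indices 0-3 select lanes of a, 4-7 lanes
// of b, and -1 (or 8) marks a lane whose value is unspecified:
//   f128 a = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 b = F128::SetValue(5.f, 6.f, 7.f, 8.f);
//   f128 r = F128::Permute<1, 2, 3, 4>(a, b);    // (2, 3, 4, 5), one vext/palignr
//   f128 s = F128::Permute<0, 4, -1, -1>(a, b);  // (1, 5, ?, ?)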
// Replaces the lanes of value selected by the SplatLane flags with lanes of
// splat (function header reconstructed from the declaration earlier in the
// header; splat is expected to carry one broadcast value, since the NEON
// path freely picks either lane of a 64-bit pair).
template <bool SplatLane0, bool SplatLane1, bool SplatLane2, bool SplatLane3>
NLIB_M(f128) F128::Splat(f128arg value, f128arg splat) NLIB_NOEXCEPT {
#if defined(NLIB_NEON)
  // Keep 64-bit lane pairs intact where possible so the NEON permute lowers
  // to cheap double-word moves.
  const int v0 = SplatLane0 ? (SplatLane1 ? 4 : 5) : 0;
  const int v1 = SplatLane1 ? (SplatLane0 ? 5 : 4) : 1;
  const int v2 = SplatLane2 ? (SplatLane3 ? 6 : 7) : 2;
  const int v3 = SplatLane3 ? (SplatLane2 ? 7 : 6) : 3;
#else
  const int v0 = SplatLane0 ? 4 : 0;
  const int v1 = SplatLane1 ? 5 : 1;
  const int v2 = SplatLane2 ? 6 : 2;
  const int v3 = SplatLane3 ? 7 : 3;
#endif
  return F128::Permute<v0, v1, v2, v3>(value, splat);
}
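// Usage sketch (editorial example; assumes the reconstructed header above
// and a broadcast second operand):
//   f128 v = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 s = F128::SetValue(9.f, each_float);
//   f128 r = F128::Splat<false, true, false, true>(v, s);  // (1, 9, 3, 9)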
NLIB_M2(f128) F128::Exp2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = powf(2.f, value.vec.v[0]);
  ret.vec.v[1] = powf(2.f, value.vec.v[1]);
  ret.vec.v[2] = powf(2.f, value.vec.v[2]);
  ret.vec.v[3] = powf(2.f, value.vec.v[3]);
  return ret;
#else
  // Split value into a rounded integer part and a fraction, evaluate a
  // rational approximation on the fraction, then scale by 2^integer through
  // the float exponent bits.
  i128 iround = F128::ConvertToI128Round(value);
  f128 fround = F128::ConvertFromI128(iround);
  f128 x = F128::Sub(value, fround);
  f128 xx = F128::Mult(x, x);

  f128 P = F128::LoadA16(F128::exp2_P_);
  f128 Q = F128::LoadA16(F128::exp2_Q_);

  // Numerator polynomial (the first two Horner steps are reconstructed).
  f128 px = F128::SetValue<0>(P, each_select32);
  px = F128::MultAdd(px, xx, F128::SetValue<1>(P, each_select32));
  px = F128::MultAdd(px, xx, F128::SetValue<2>(P, each_select32));
  px = F128::Mult(x, px);

  // Denominator polynomial (the first Horner step is reconstructed).
  f128 qx = F128::SetValue<0>(Q, each_select32);
  qx = F128::MultAdd(qx, xx, F128::SetValue<1>(Q, each_select32));

  x = F128::Div(px, F128::Sub(qx, px));
  // 2^frac ~ 1 + 2 * px / (qx - px)  (step reconstructed)
  x = F128::MultAdd(2.f, x, F128::SetOne());

  // Build 2^iround directly in the exponent field and multiply.
  iround = I128::Add32(iround, I128::SetValue(127, each_int32));
  iround = I128::ShiftLeftLogical32(iround, 23);
  x = F128::Mult(x, F128::CastFromI128(iround));
  return x;
#endif
}
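// Quick check (editorial): Exp2 of (0, 1, -1, 0.5) gives approximately
// (1, 2, 0.5, 1.41421). Rounding to the nearest integer keeps the fraction
// handed to the rational approximation inside [-0.5, +0.5].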
// F128::Exp — e^v == 2^(v * log2(e)). Function headers in this region are
// reconstructed.
NLIB_M2(f128) F128::Exp(f128arg value) NLIB_NOEXCEPT {
  static const float log2e = 1.44269504088896340736f;
  return Exp2(F128::Mult(log2e, value));
}

// F128::SinH — sinh(v) = (e^v - e^-v) / 2; the neg_one term folds the final
// halving into the exponents: 2^(v*log2e - 1) - 2^(-v*log2e - 1).
NLIB_M2(f128) F128::SinH(f128arg value) NLIB_NOEXCEPT {
  static const float log2e = 1.44269504088896340736f;
  f128 neg_one = F128::SetValue(-1.f, each_float);  // reconstructed
  f128 v0 = F128::MultAdd(log2e, value, neg_one);
  f128 v1 = F128::MultSub(log2e, value, neg_one);
  f128 e0 = F128::Exp2(v0);  // reconstructed
  f128 e1 = F128::Exp2(v1);  // reconstructed
  return F128::Sub(e0, e1);
}

// F128::CosH — cosh(v) = (e^v + e^-v) / 2, by the same construction.
NLIB_M2(f128) F128::CosH(f128arg value) NLIB_NOEXCEPT {
  static const float log2e = 1.44269504088896340736f;
  f128 neg_one = F128::SetValue(-1.f, each_float);  // reconstructed
  f128 v0 = F128::MultAdd(log2e, value, neg_one);
  f128 v1 = F128::MultSub(log2e, value, neg_one);
  f128 e0 = F128::Exp2(v0);  // reconstructed
  f128 e1 = F128::Exp2(v1);  // reconstructed
  return F128::Add(e0, e1);
}
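// Editorial note: because the -1 term is folded into the exponent, each
// Exp2 call above already returns e^{+-v} / 2 and no extra multiply is
// needed; e.g. CosH(0) = 0.5 + 0.5 = 1.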
// F128::TanH — tanh(v) = 1 - 2 / (e^{2v} + 1). tanh_cvalue_ is assumed to
// hold {2 * log2(e), 1, ...}; the elided middle lines are reconstructed.
NLIB_M2(f128) F128::TanH(f128arg value) NLIB_NOEXCEPT {
  f128 cvalue = F128::LoadA16(tanh_cvalue_);
  f128 half = F128::SetValue(0.5f, each_float);  // reconstructed
  f128 e = F128::Exp2(F128::Mult(F128::SetValue<0>(cvalue, each_select32), value));  // e^{2v}, reconstructed
  e = F128::MultAdd(half, e, half);  // (e^{2v} + 1) / 2
  e = F128::Recp(e);                 // 2 / (e^{2v} + 1), reconstructed
  return F128::Sub(F128::SetValue<1>(cvalue, each_select32), e);
}
NLIB_M2(f128) F128::Tan(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  f128 ret;
  ret.vec.v[0] = tanf(value.vec.v[0]);
  ret.vec.v[1] = tanf(value.vec.v[1]);
  ret.vec.v[2] = tanf(value.vec.v[2]);
  ret.vec.v[3] = tanf(value.vec.v[3]);
  return ret;
#else
  f128 C = F128::LoadA16(&F128::tan_c_[0]);
  f128 one = F128::SetOne();

  // Range reduction: g = round(value * 2/pi); lanes with even g evaluate
  // tan(f), lanes with odd g evaluate -cot(f). The reduction lines are
  // reconstructed, assuming tan_c_ holds 2/pi followed by the split pieces
  // of pi/2.
  f128 g = F128::Mult(F128::SetValue<0>(C, each_select32), value);  // reconstructed
  i128 t0 = I128::And(F128::ConvertToI128Round(g), I128::SetValue(1U, each_uint32));
  i128 cmp = I128::CmpEq32(t0, I128::SetZero());
  f128 nearx_axis = F128::CastFromI128(cmp);
  g = F128::ConvertFromI128(F128::ConvertToI128Round(g));           // reconstructed

  f128 f = F128::MultSub(g, F128::SetValue<1>(C, each_select32), value);  // reconstructed
  f = F128::MultSub(g, F128::SetValue<2>(C, each_select32), f);           // reconstructed
  f = F128::MultSub(g, F128::SetValue<3>(C, each_select32), f);           // reconstructed
  // Lanes where |f| is too small for the rational form fall back to f.
  f128 near_axis = F128::CmpLt(F128::Abs(f), F128::SetValue(1.0e-4f, each_float));  // reconstructed

  f128 P = F128::LoadA16(&F128::tan_p_[0]);
  f128 Q = F128::LoadA16(&F128::tan_q_[0]);
  f128 ff = F128::Mult(f, f);

  f128 p = F128::SetValue<2>(P, each_select32);                   // reconstructed
  p = F128::MultAdd(p, ff, F128::SetValue<1>(P, each_select32));  // reconstructed
  p = F128::MultAdd(p, ff, F128::SetValue<0>(P, each_select32));
  p = F128::MultAdd(p, ff, one);
  p = F128::Mult(f, p);

  f128 q = F128::SetValue<3>(Q, each_select32);                   // reconstructed
  q = F128::MultAdd(q, ff, F128::SetValue<2>(Q, each_select32));  // reconstructed
  q = F128::MultAdd(q, ff, F128::SetValue<1>(Q, each_select32));
  q = F128::MultAdd(q, ff, F128::SetValue<0>(Q, each_select32));
  q = F128::MultAdd(q, ff, one);

  p = F128::Select(near_axis, f, p);
  q = F128::Select(near_axis, one, q);

  f128 r0 = F128::Div(p, q);
  f128 r1 = F128::Negate(F128::Recp(r0));
  return F128::Select(nearx_axis, r0, r1);
#endif
}
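// Quick check (editorial): for value = pi/4, g = round(0.5) = 0 is even, so
// the lane takes the tan branch with f = pi/4 and returns ~1. Near pi/2 the
// reduction leaves f ~ 0 with odd g, and the lane returns -1/tan(f), which
// correctly blows up at the pole.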
NLIB_M2(f128) F128::Log2(f128arg value) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(CAFE)
  static const float scale = 1.4426950408889634f;  // log2(e)
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return F128::Mult(scale, ret);
#else
  // Split the float encoding into mantissa x in [1, 2) and unbiased
  // exponent e (the mantissa masking line is reconstructed).
  f128 x = F128::And(F128::SetValue(0x007FFFFFU, each_uint32), value);  // reconstructed
  x = F128::Or(F128::SetValue(127U << 23, each_uint32), x);
  i128 e = I128::And(I128::SetValue(0x7F800000U, each_uint32), F128::CastToI128(value));
  e = I128::ShiftRightLogical32(e, 23);
  e = I128::Sub32(e, I128::SetValue(127U, each_uint32));

  x = F128::Sub(x, F128::SetOne());
  f128 z = F128::Mult(x, x);

  f128 pq0 = F128::LoadA16(&F128::log2_PQ_[0]);
  f128 pq1 = F128::LoadA16(&F128::log2_PQ_[4]);
  f128 pq2 = F128::LoadA16(&F128::log2_PQ_[8]);

  // Rational approximation of log(1 + x); the first Horner step of each
  // polynomial is reconstructed.
  f128 p = F128::SetValue<0>(pq0, each_select32);
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<2>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<3>(pq0, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<0>(pq1, each_select32));
  p = F128::MultAdd(p, x, F128::SetValue<1>(pq1, each_select32));

  f128 q = F128::Add(x, F128::SetValue<2>(pq1, each_select32));  // reconstructed
  q = F128::MultAdd(q, x, F128::SetValue<3>(pq1, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<0>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<1>(pq2, each_select32));
  q = F128::MultAdd(q, x, F128::SetValue<2>(pq2, each_select32));

  f128 y = F128::Mult(z, p);
  y = F128::Div(y, q);
  y = F128::MultAdd(x, y, F128::Mult(-0.5f, z));

  // Assemble log2(value) = (x + y) * log2(e) + e, using the split constant
  // log2ea = log2(e) - 1 for extra precision (declaration reconstructed).
  f128 log2ea = F128::SetValue(0.44269504088896340736f, each_float);  // reconstructed
  f128 result = F128::Mult(y, log2ea);
  result = F128::MultAdd(log2ea, x, result);
  result = F128::Add(result, y);
  result = F128::Add(result, x);
  result = F128::Add(result, F128::ConvertFromI128(e));

  // Special cases; the lane layout of nan_inf_ is assumed to be
  // {NaN, +inf, -inf, -NaN}.
  f128 nan_inf = F128::LoadA16(reinterpret_cast<const float*>(F128::nan_inf_));
  f128 nan = F128::SetValue<0>(nan_inf, each_select32);      // reconstructed
  f128 inf = F128::SetValue<1>(nan_inf, each_select32);      // reconstructed
  f128 neg_inf = F128::SetValue<2>(nan_inf, each_select32);  // reconstructed
  f128 neg_nan = F128::SetValue<3>(nan_inf, each_select32);  // reconstructed

  f128 is_nan = F128::IsNaN(value);
  result = F128::Select(is_nan, nan, result);

  f128 is_inf = F128::IsInfinite(value);
  f128 is_pos = F128::CmpGtZero(value);
  f128 is_pos_inf = F128::And(is_inf, is_pos);
  result = F128::Select(is_pos_inf, inf, result);

  f128 is_zero = F128::CmpEqZero(value);
  result = F128::Select(is_zero, neg_inf, result);

  f128 is_neg = F128::CmpLtZero(value);
  result = F128::Select(is_neg, neg_nan, result);
  return result;
#endif
}
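// Quick check (editorial): Log2 of (1, 2, 8, 0.25) -> (0, 1, 3, -2). The
// trailing selects reproduce log2f edge cases: NaN -> NaN, +inf -> +inf,
// 0 -> -inf, and negative inputs -> NaN.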
NLIB_M2(f128) F128::Log(f128arg value) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = logf(value.vec.v[0]);
  ret.vec.v[1] = logf(value.vec.v[1]);
  ret.vec.v[2] = logf(value.vec.v[2]);
  ret.vec.v[3] = logf(value.vec.v[3]);
  return ret;
#else
  f128 x = F128::Log2(value);
  static const float recp_log2e = 0.6931471805597018f;  // ~ln(2)
  return F128::Mult(recp_log2e, x);
#endif
}
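// Editorial note: Log is Log2 composed with a multiply by ln(2) ~ 0.693147;
// e.g. Log(F128::SetValue(1.f, each_float)) is zero in every lane.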
#endif  // NLIB_DOXYGEN

#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
// In-class declaration (the enclosing class body and the matching #endif are
// elided in this excerpt):
//   SimdMatrix(float m00, float m01, float m02, float m03,
//              float m10, float m11, float m12, float m13,
//              float m20, float m21, float m22, float m23,
//              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT;

inline SimdMatrix::SimdMatrix(float m00, float m01, float m02, float m03,
                              float m10, float m11, float m12, float m13,
                              float m20, float m21, float m22, float m23,
                              float m30, float m31, float m32, float m33) NLIB_NOEXCEPT {
  r[0] = F128::SetValue(m00, m01, m02, m03);
  r[1] = F128::SetValue(m10, m11, m12, m13);
  r[2] = F128::SetValue(m20, m21, m22, m23);
  r[3] = F128::SetValue(m30, m31, m32, m33);
}
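// Usage sketch (editorial example): each group of four scalars becomes one
// f128 row:
//   SimdMatrix m(1.f, 0.f, 0.f, 0.f,
//                0.f, 1.f, 0.f, 0.f,
//                0.f, 0.f, 1.f, 0.f,
//                0.f, 0.f, 0.f, 1.f);  // identity; m.r[i] holds row i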
// Loads a 4x4 matrix from p, picking the widest load the pointer's alignment
// allows (constructor header and case labels reconstructed).
inline SimdMatrix::SimdMatrix(const float* p) NLIB_NOEXCEPT {
  uintptr_t algn = reinterpret_cast<uintptr_t>(p) & 15;
  NLIB_ASSERT((algn & 3) == 0);
  switch (algn >> 2) {
    case 0:
      r[0] = F128::LoadA16(p);
      r[1] = F128::LoadA16(p + 4);
      r[2] = F128::LoadA16(p + 8);
      r[3] = F128::LoadA16(p + 12);
      break;
    case 1:
    case 3:
      r[0] = F128::LoadA4(p);
      r[1] = F128::LoadA4(p + 4);
      r[2] = F128::LoadA4(p + 8);
      r[3] = F128::LoadA4(p + 12);
      break;
    case 2:
      r[0] = F128::LoadA8(p);
      r[1] = F128::LoadA8(p + 4);
      r[2] = F128::LoadA8(p + 8);
      r[3] = F128::LoadA8(p + 12);
      break;
    default:
      break;
  }
}
#if !defined(NLIB_SIMD) || defined(NLIB_F128_SIMD_NOUSE)
// (block elided in this excerpt)
#endif

#if defined(NLIB_SSE41) || defined(NLIB_F128_SIMD_NOUSE)
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)       \
  {                                                       \
    f128 tmp0 = F128::Permute<0, 1, 4, 5>(row0, row1);    \
    f128 tmp2 = F128::Permute<2, 3, 6, 7>(row0, row1);    \
    f128 tmp1 = F128::Permute<0, 1, 4, 5>(row2, row3);    \
    f128 tmp3 = F128::Permute<2, 3, 6, 7>(row2, row3);    \
    (row0) = F128::Permute<0, 2, 4, 6>(tmp0, tmp1);       \
    (row1) = F128::Permute<1, 3, 5, 7>(tmp0, tmp1);       \
    (row2) = F128::Permute<0, 2, 4, 6>(tmp2, tmp3);       \
    (row3) = F128::Permute<1, 3, 5, 7>(tmp2, tmp3);       \
  }
#elif defined(NLIB_NEON)
#ifdef __aarch64__
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)             \
  {                                                             \
    float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1);              \
    float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3);              \
    uint64x2_t row0_, row1_, row2_, row3_;                      \
    row0_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]),   \
                       vreinterpretq_u64_f32(trn_f1_.val[0]));  \
    row0 = vreinterpretq_f32_u64(row0_);                        \
    row1_ = vtrn1q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]),   \
                       vreinterpretq_u64_f32(trn_f1_.val[1]));  \
    row1 = vreinterpretq_f32_u64(row1_);                        \
    row2_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[0]),   \
                       vreinterpretq_u64_f32(trn_f1_.val[0]));  \
    row2 = vreinterpretq_f32_u64(row2_);                        \
    row3_ = vtrn2q_u64(vreinterpretq_u64_f32(trn_f0_.val[1]),   \
                       vreinterpretq_u64_f32(trn_f1_.val[1]));  \
    row3 = vreinterpretq_f32_u64(row3_);                        \
  }
#else
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)                                       \
  {                                                                                       \
    float32x4x2_t trn_f0_ = vtrnq_f32(row0, row1);                                        \
    float32x4x2_t trn_f1_ = vtrnq_f32(row2, row3);                                        \
    row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));      \
    row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));      \
    row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));    \
    row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));    \
  }
#endif
#else
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)       \
  {                                                       \
    f32x2 tmp0, tmp1;                                     \
    tmp0 = __PS_MERGE00(row0.vec.ps[0], row1.vec.ps[0]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[0], row1.vec.ps[0]);  \
    row0.vec.ps[0] = tmp0;                                \
    row1.vec.ps[0] = tmp1;                                \
    tmp0 = __PS_MERGE00(row2.vec.ps[1], row3.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row2.vec.ps[1], row3.vec.ps[1]);  \
    row2.vec.ps[1] = tmp0;                                \
    row3.vec.ps[1] = tmp1;                                \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]);  \
    row0.vec.ps[1] = row2.vec.ps[0];                      \
    row1.vec.ps[1] = row3.vec.ps[0];                      \
    row2.vec.ps[0] = tmp0;                                \
    row3.vec.ps[0] = tmp1;                                \
    tmp0 = __PS_MERGE00(row0.vec.ps[1], row1.vec.ps[1]);  \
    tmp1 = __PS_MERGE11(row0.vec.ps[1], row1.vec.ps[1]);  \
    row0.vec.ps[1] = tmp0;                                \
    row1.vec.ps[1] = tmp1;                                \
  }
#endif

#if !defined(NLIB_DOXYGEN) && !defined(NN_PLATFORM_CTR)
// (three conditional blocks elided in this excerpt)
#endif

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDFLOAT_H_
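// Usage sketch (editorial example): NLIB_F128_TRANSPOSE transposes a 4x4
// matrix held in four f128 rows in place:
//   f128 r0 = F128::SetValue(1.f, 2.f, 3.f, 4.f);
//   f128 r1 = F128::SetValue(5.f, 6.f, 7.f, 8.f);
//   f128 r2 = F128::SetValue(9.f, 10.f, 11.f, 12.f);
//   f128 r3 = F128::SetValue(13.f, 14.f, 15.f, 16.f);
//   NLIB_F128_TRANSPOSE(r0, r1, r2, r3);  // r0 = (1, 5, 9, 13), etc.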