16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ 26 template<
typename MyVector4>
28 template<
typename MyVector4>
31 return F128::IsAllMaskTrue(F128::CmpEq(vec1, vec2));
34 return F128::IsAllMaskTrue(F128::CmpLt(vec1, vec2));
37 return F128::IsAllMaskTrue(F128::CmpLe(vec1, vec2));
40 return F128::IsAllMaskTrue(F128::CmpGt(vec1, vec2));
43 return F128::IsAllMaskTrue(F128::CmpGe(vec1, vec2));
46 return F128::IsAllMaskTrue(F128::CmpNe(vec1, vec2));
48 static bool __vectorcall
50 return F128::IsAllMaskTrue(F128::CmpNearEq(vec1, vec2, eps));
53 return !F128::IsAllMaskFalse(F128::IsNaN(vec));
56 return !F128::IsAllMaskFalse(F128::IsInfinite(vec));
59 return F128::IsAllMaskTrue(F128::InBound(vec, bounds));
63 template <
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
65 static f128 __vectorcall
67 static f128 __vectorcall
93 #define NLIB_M(tp) inline tp __vectorcall 94 #define NLIB_MH(tp) inline tp __vectorcall 96 template<
typename MyVector4>
104 return F128::LoadA4(reinterpret_cast<const float*>(&p->x));
107 template<
typename MyVector4>
115 F128::StoreA4(reinterpret_cast<float*>(&p->x), vec);
120 #ifdef NLIB_F128_SIMD_NOUSE 122 ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] =
123 vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
124 vec1.vec.v[2] * vec2.vec.v[2] + vec1.vec.v[3] * vec2.vec.v[3];
126 #elif defined(NLIB_SSE41) 127 return _mm_dp_ps(vec1, vec2, 0xFF);
128 #elif defined(NLIB_NEON) 130 float32x4_t tmp = vmulq_f32(vec1, vec2);
131 tmp = vpaddq_f32(tmp, tmp);
132 tmp = vpaddq_f32(tmp, tmp);
135 float32x4_t tmp = vmulq_f32(vec1, vec2);
136 float32x2_t v1 = vget_low_f32(tmp);
137 float32x2_t v2 = vget_high_f32(tmp);
138 v1 = vpadd_f32(v1, v1);
139 v2 = vpadd_f32(v2, v2);
140 v1 = vadd_f32(v1, v2);
141 return vcombine_f32(v1, v1);
144 f128 tmp = F128::Mult(vec1, vec2);
145 f32x2 val = __PS_ADD(tmp.vec.ps[0], tmp.vec.ps[1]);
146 val = __PS_SUM0(val, val, val);
148 ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
153 template <
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
156 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41) 157 return _mm_dp_ps(vec1, vec2, (0xF0 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
158 (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
160 return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
166 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 168 float32x4_t zero = vdupq_n_f32(0);
169 float32x4_t tmp0 = vmulq_f32(a, b0);
170 float32x4_t tmp1 = vmulq_f32(a, b1);
171 tmp0 = vpaddq_f32(tmp0, tmp1);
172 tmp0 = vpaddq_f32(tmp0, zero);
175 float32x4x2_t trn = vtrnq_f32(b0, b1);
176 float32x2_t lo = vget_low_f32(a);
177 float32x2_t hi = vget_high_f32(a);
178 float32x4_t xxzz = vcombine_f32(vdup_lane_f32(lo, 0), vdup_lane_f32(hi, 0));
179 float32x4_t yyww = vcombine_f32(vdup_lane_f32(lo, 1), vdup_lane_f32(hi, 1));
180 float32x4_t tmp = vmulq_f32(trn.val[0], xxzz);
181 tmp = vmlaq_f32(tmp, yyww, trn.val[1]);
182 float32x2_t result = vadd_f32(vget_low_f32(tmp), vget_high_f32(tmp));
183 return vcombine_f32(result, vdup_n_f32(0.f));
186 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
187 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
188 return F128::Or(t0, t1);
195 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 197 float32x4_t zero = vdupq_n_f32(0);
198 float32x4_t tmp0 = vmulq_f32(a, b0);
199 float32x4_t tmp1 = vmulq_f32(a, b1);
200 float32x4_t tmp2 = vmulq_f32(a, b2);
201 tmp0 = vpaddq_f32(tmp0, tmp1);
202 tmp2 = vpaddq_f32(tmp2, zero);
203 tmp0 = vpaddq_f32(tmp0, tmp2);
206 f128 row0, row1, row2, row3;
207 float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
208 float32x4x2_t trn_f1_ = vtrnq_f32(b2, vdupq_n_f32(0.f));
209 row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
210 row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
211 row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
212 row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
219 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
220 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
221 f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
222 return F128::Or(F128::Or(t0, t1), t2);
229 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 231 float32x4_t tmp0 = vmulq_f32(a, b0);
232 float32x4_t tmp1 = vmulq_f32(a, b1);
233 float32x4_t tmp2 = vmulq_f32(a, b2);
234 float32x4_t tmp3 = vmulq_f32(a, b3);
235 tmp0 = vpaddq_f32(tmp0, tmp1);
236 tmp2 = vpaddq_f32(tmp2, tmp3);
237 tmp0 = vpaddq_f32(tmp0, tmp2);
240 f128 row0, row1, row2, row3;
241 float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
242 float32x4x2_t trn_f1_ = vtrnq_f32(b2, b3);
243 row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
244 row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
245 row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
246 row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
253 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
254 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
255 f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
256 f128 t3 = Vector4::DotEx<false, false, false, true>(a, b3);
257 return F128::Or(F128::Or(t0, t1), F128::Or(t2, t3));
263 f128 dot = Vector4::Dot(vec, vec);
264 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 266 x = vrsqrteq_f32(dot);
267 x = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
268 f128 rsqrt = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
270 f128 rsqrt = F128::RecpSqrt(dot);
272 f128 inf = F128::SetInfinity();
273 f128 eqzero = F128::CmpEqZero(dot);
274 f128 eqinf = F128::CmpEq(dot, inf);
276 f128 nan = F128::SetNaN();
277 ret = F128::AndNot(eqzero, ret);
278 ret = F128::Select(eqinf, nan, ret);
287 return F128::Sqrt(Dot(vec, vec));
292 return F128::SqrtEst(Dot(vec, vec));
297 return F128::RecpSqrt(Dot(vec, vec));
302 return F128::RecpSqrtEst(Dot(vec, vec));
306 return F128::Mult(vec, RecpLengthEst(vec));
312 f128 ret = Dot(vec1_normalized, vec2_normalized);
313 ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
314 return F128::ArcCos(ret);
318 f128 s = Dot(vec, normal);
320 return F128::MultSub(s, normal, vec);
379 #endif // NLIB_DOXYGEN 384 #endif // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ static bool InBound(SimdVectorArg vec, SimdVectorArg bounds) noexcept
Evaluates whether the elements of vec are inside the bounds of bounds.
static bool CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) noexcept
Compares vec1 and vec2 to see whether they are nearly equal.
f128arg SimdVectorArg
f128arg is defined using typedef.
static bool IsNaN(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are NaN.
static bool CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are equal or greater than the corresponding elements of vec2.
static bool CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are larger than the corresponding elements of vec2.
f128 r[4]
Keeps each row of a 4x4 matrix.
The structure for keeping a 4x4 matrix.
static bool CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Compares vec1 and vec2 to see whether they are unequal.
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
The class with the collection of functions that perform calculations on four-dimensional vectors.
Defines the class and functions for SIMD computations on single-precision floating-point numbers.
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
static bool CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Compares two four-dimensional vectors to see if they are equal.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
static bool CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are smaller than the corresponding elements of vec2.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
static bool IsInfinite(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are positive infinity or negative infinity.
static bool CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are equal or less than the corresponding elements of vec2.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors.