16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ 26 template<
typename MyVector4>
28 template<
typename MyVector4>
// NOTE(review): the function signatures are missing from this listing; each
// statement below is presumably the body of one Vector4 predicate (CmpEq,
// CmpLt, CmpLe, CmpGt, CmpGe, CmpNe, CmpNearEq, IsNaN, IsInfinite, InBound,
// in that order) — confirm against the full header.
// True only when F128::IsAllMaskTrue accepts the whole CmpEq mask.
31 return F128::IsAllMaskTrue(F128::CmpEq(vec1, vec2));
// Same all-elements pattern for "less than".
34 return F128::IsAllMaskTrue(F128::CmpLt(vec1, vec2));
// Same all-elements pattern for "less than or equal".
37 return F128::IsAllMaskTrue(F128::CmpLe(vec1, vec2));
// Same all-elements pattern for "greater than".
40 return F128::IsAllMaskTrue(F128::CmpGt(vec1, vec2));
// Same all-elements pattern for "greater than or equal".
43 return F128::IsAllMaskTrue(F128::CmpGe(vec1, vec2));
// All-elements test on the CmpNe mask, so one equal element makes this false.
// NOTE(review): if "unequal" is meant as "differs in at least one element",
// this test is stricter than that — confirm the intended contract.
46 return F128::IsAllMaskTrue(F128::CmpNe(vec1, vec2));
// All-elements test of the near-equality comparison using the tolerance eps.
50 return F128::IsAllMaskTrue(F128::CmpNearEq(vec1, vec2, eps));
// Negated IsAllMaskFalse: true as soon as any part of the IsNaN mask is set.
53 return !F128::IsAllMaskFalse(F128::IsNaN(vec));
// Same "any element" pattern for the IsInfinite mask.
56 return !F128::IsAllMaskFalse(F128::IsInfinite(vec));
// All-elements test on the F128::InBound(vec, bounds) mask.
59 return F128::IsAllMaskTrue(F128::InBound(vec, bounds));
63 template<
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
// Helper macros for function definitions: both expand to `inline tp __vectorcall`.
// NOTE(review): the preprocessor condition selecting this variant is not visible here.
93 #define NLIB_M(tp) inline tp __vectorcall 94 #define NLIB_MH(tp) inline tp __vectorcall 96 template<
typename MyVector4>
// Loads via F128::LoadA4 from the address of p->x, reinterpreted as const float*.
// NOTE(review): assumes MyVector4 lays out its float members contiguously
// starting at x — confirm against the type's definition.
104 return F128::LoadA4(reinterpret_cast<const float*>(&p->x));
107 template<
typename MyVector4>
// Stores vec via F128::StoreA4 to the address of p->x, reinterpreted as float*.
115 F128::StoreA4(reinterpret_cast<float*>(&p->x), vec);
// Four-component dot product, with one implementation per SIMD backend.
// NOTE(review): the signature and several preprocessor lines are missing from
// this listing, so the branch boundaries described below are partly inferred.
// No-SIMD fallback: the sum of the four element products is assigned to all
// four elements of ret.
120 #ifdef NLIB_F128_SIMD_NOUSE 122 ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] =
123 vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
124 vec1.vec.v[2] * vec2.vec.v[2] + vec1.vec.v[3] * vec2.vec.v[3];
// SSE4.1: _mm_dp_ps with mask 0xFF multiplies all four lanes and writes the
// sum to all four result lanes.
126 #elif defined(NLIB_SSE41) 127 return _mm_dp_ps(vec1, vec2, 0xFF);
// NEON, first variant: two pairwise-add passes over the lane products leave
// the full sum in every lane.
128 #elif defined(NLIB_NEON) 130 float32x4_t tmp = vmulq_f32(vec1, vec2);
131 tmp = vpaddq_f32(tmp, tmp);
132 tmp = vpaddq_f32(tmp, tmp);
// NEON, second variant (presumably for targets without vpaddq_f32 — confirm):
// pairwise-add each 64-bit half, add the two halves, then duplicate the
// result into both halves of the return value.
135 float32x4_t tmp = vmulq_f32(vec1, vec2);
136 float32x2_t v1 = vget_low_f32(tmp);
137 float32x2_t v2 = vget_high_f32(tmp);
138 v1 = vpadd_f32(v1, v1);
139 v2 = vpadd_f32(v2, v2);
140 v1 = vadd_f32(v1, v2);
141 return vcombine_f32(v1, v1);
// Paired-single variant (__PS_* intrinsics): multiply, add the two pairs,
// presumably reduce within the pair via __PS_SUM0, then duplicate val[0]
// into both halves of ret.
144 f128 tmp = F128::Mult(vec1, vec2);
145 f32x2 val = __PS_ADD(tmp.vec.ps[0], tmp.vec.ps[1]);
146 val = __PS_SUM0(val, val, val);
148 ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
// Dot product whose destination lanes are chosen by the four SetLaneN flags.
// NOTE(review): the signature and the call that consumes the SSE4.1 mask are
// not visible in this listing.
153 template<
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
// SSE4.1: the visible expression builds a mask with high nibble 0xF and one
// low-nibble bit per SetLaneN flag (bit 0 for lane 0 through bit 3 for lane 3).
156 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41) 159 (0xF0 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) | (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
// Other backends: F128::Splat with the same flags, fed F128::SetZero() and the
// full Dot(vec1, vec2); presumably unselected lanes come from the zero vector.
161 return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
// Two dot products at once: dot(a, b0) and dot(a, b1).
// NOTE(review): the signature, the inner preprocessor branches and the first
// variant's return statement are missing from this listing.
// NEON, first variant: the first pairwise add packs partial sums of both
// products into one vector; the second, against zero, finishes the sums in
// lanes 0 and 1 and leaves lanes 2 and 3 at zero.
167 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 169 float32x4_t zero = vdupq_n_f32(0);
170 float32x4_t tmp0 = vmulq_f32(a, b0);
171 float32x4_t tmp1 = vmulq_f32(a, b1);
172 tmp0 = vpaddq_f32(tmp0, tmp1);
173 tmp0 = vpaddq_f32(tmp0, zero);
// NEON, second variant: vtrnq_f32 interleaves b0 and b1 so that val[0] holds
// their x/z components and val[1] their y/w components.
176 float32x4x2_t trn = vtrnq_f32(b0, b1);
177 float32x2_t lo = vget_low_f32(a);
178 float32x2_t hi = vget_high_f32(a);
// Broadcast a as (x, x, z, z) and (y, y, w, w) to match the interleaved layout.
179 float32x4_t xxzz = vcombine_f32(vdup_lane_f32(lo, 0), vdup_lane_f32(hi, 0));
180 float32x4_t yyww = vcombine_f32(vdup_lane_f32(lo, 1), vdup_lane_f32(hi, 1));
181 float32x4_t tmp = vmulq_f32(trn.val[0], xxzz);
182 tmp = vmlaq_f32(tmp, yyww, trn.val[1]);
// Low half holds the x/y partial sums, high half the z/w partial sums; adding
// them gives both dot products, which are returned with a zeroed upper half.
183 float32x2_t result = vadd_f32(vget_low_f32(tmp), vget_high_f32(tmp));
184 return vcombine_f32(result, vdup_n_f32(0.f));
// Generic path: one DotEx per operand, targeting lanes 0 and 1, merged with Or.
187 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
188 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
189 return F128::Or(t0, t1);
// Three dot products at once: dot(a, b0), dot(a, b1) and dot(a, b2).
// NOTE(review): the signature, the inner preprocessor branches and the tail of
// each NEON variant are missing from this listing.
// NEON, first variant: pairwise adds reduce the three products; pairing tmp2
// with zero keeps the fourth result lane at zero.
196 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 198 float32x4_t zero = vdupq_n_f32(0);
199 float32x4_t tmp0 = vmulq_f32(a, b0);
200 float32x4_t tmp1 = vmulq_f32(a, b1);
201 float32x4_t tmp2 = vmulq_f32(a, b2);
202 tmp0 = vpaddq_f32(tmp0, tmp1);
203 tmp2 = vpaddq_f32(tmp2, zero);
204 tmp0 = vpaddq_f32(tmp0, tmp2);
// NEON, second variant: transpose (b0, b1, b2, 0) so that row0..row3 hold the
// x, y, z and w components respectively. The code that combines the rows with
// a is not visible here.
207 f128 row0, row1, row2, row3;
208 float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
209 float32x4x2_t trn_f1_ = vtrnq_f32(b2, vdupq_n_f32(0.f));
210 row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
211 row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
212 row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
213 row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
// Generic path: one DotEx per operand, targeting lanes 0, 1 and 2, merged with Or.
220 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
221 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
222 f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
223 return F128::Or(F128::Or(t0, t1), t2);
// Four dot products at once: dot(a, b0) through dot(a, b3), one per lane.
// NOTE(review): the signature, the inner preprocessor branches and the tail of
// each NEON variant are missing from this listing.
// NEON, first variant: two levels of pairwise adds reduce the four products so
// that lane i ends up holding dot(a, b_i).
231 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON) 233 float32x4_t tmp0 = vmulq_f32(a, b0);
234 float32x4_t tmp1 = vmulq_f32(a, b1);
235 float32x4_t tmp2 = vmulq_f32(a, b2);
236 float32x4_t tmp3 = vmulq_f32(a, b3);
237 tmp0 = vpaddq_f32(tmp0, tmp1);
238 tmp2 = vpaddq_f32(tmp2, tmp3);
239 tmp0 = vpaddq_f32(tmp0, tmp2);
// NEON, second variant: transpose (b0, b1, b2, b3) so that row0..row3 hold the
// x, y, z and w components respectively. The code that combines the rows with
// a is not visible here.
242 f128 row0, row1, row2, row3;
243 float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
244 float32x4x2_t trn_f1_ = vtrnq_f32(b2, b3);
245 row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
246 row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
247 row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
248 row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
// Generic path: one DotEx per operand, each targeting its own lane, merged with Or.
255 f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
256 f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
257 f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
258 f128 t3 = Vector4::DotEx<false, false, false, true>(a, b3);
259 return F128::Or(F128::Or(t0, t1), F128::Or(t2, t3));
// NOTE(review): presumably the body of the vector normalisation routine; the
// signature, the declaration of x and the statement that first assigns ret
// (listing lines 267 and 277) are missing here — confirm against the header.
// Squared length of vec.
265 f128 dot = Vector4::Dot(vec, vec);
// NEON: start from the reciprocal-square-root estimate and refine it with two
// vrsqrtsq_f32 steps.
266 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 268 x = vrsqrteq_f32(dot);
269 x = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
270 f128 rsqrt = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
// Other backends use the library's reciprocal square root.
272 f128 rsqrt = F128::RecpSqrt(dot);
// Masks for the two special cases: squared length equal to zero or to infinity.
274 f128 inf = F128::SetInfinity();
275 f128 eqzero = F128::CmpEqZero(dot);
276 f128 eqinf = F128::CmpEq(dot, inf);
278 f128 nan = F128::SetNaN();
// Presumably clears ret where the squared length is zero, then substitutes NaN
// where it is infinite — confirm the argument conventions of AndNot and Select.
279 ret = F128::AndNot(eqzero, ret);
280 ret = F128::Select(eqinf, nan, ret);
// NOTE(review): the function signatures are missing from this listing; each
// statement below is presumably the body of a separate length helper.
// Squared length: the dot product of vec with itself.
286 return Dot(vec, vec);
// Length: square root of the squared length.
291 return F128::Sqrt(Dot(vec, vec));
// Length using F128::SqrtEst (presumably a faster, lower-precision square root).
296 return F128::SqrtEst(Dot(vec, vec));
// Reciprocal of the length.
301 return F128::RecpSqrt(Dot(vec, vec));
// Reciprocal of the length using F128::RecpSqrtEst (presumably an estimate).
306 return F128::RecpSqrtEst(Dot(vec, vec));
// Scales vec by RecpLengthEst(vec); unlike the full normalisation body, no
// zero or infinity masks are applied in this statement.
310 return F128::Mult(vec, RecpLengthEst(vec));
// NOTE(review): signature not visible; the block name used for this edit is a
// placeholder. The argument names indicate that both inputs are expected to be
// normalised already.
// Dot product of the two vectors.
316 f128 ret = Dot(vec1_normalized, vec2_normalized);
// Clamp to [-1, 1] before ArcCos, presumably so rounding error cannot push the
// value outside the arccosine domain.
317 ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
318 return F128::ArcCos(ret);
// NOTE(review): signature and listing line 323 are missing; this looks like a
// reflection of vec about normal, but the scaling applied to s between these
// two statements is not visible — confirm against the header.
// Dot product of vec and normal.
322 f128 s = Dot(vec, normal);
// Combines s, normal and vec through F128::MultSub (presumably vec - s * normal).
324 return F128::MultSub(s, normal, vec);
383 #endif // NLIB_DOXYGEN 388 #endif // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_ static bool InBound(SimdVectorArg vec, SimdVectorArg bounds) noexcept
Evaluates whether all of the elements of vec are within the bounds specified by bounds.
static bool CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) noexcept
Compares vec1 and vec2 to see whether they are nearly equal.
f128arg SimdVectorArg
f128arg is defined using typedef.
static bool IsNaN(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are NaN.
static bool CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are equal to or greater than the corresponding elements of vec2.
static bool CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are larger than the corresponding elements of vec2.
The structure for keeping a 4x4 matrix.
static bool CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are unequal to the corresponding elements of vec2.
#define NLIB_NOEXCEPT
Defines noexcept, or an equivalent, suited to the build environment.
The class with the collection of functions that perform calculations on four-dimensional vectors...
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
static bool CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Compares two four-dimensional vectors to see if they are equal.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
static bool CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are smaller than the corresponding elements of vec2.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
static bool IsInfinite(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are positive infinity or negative infinity.
static bool CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks to see whether all of the elements of vec1 are equal to or less than the corresponding elements of vec2.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...