#ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_

// Declarations of the templated load/store helpers defined below (names reconstructed).
template <typename MyVector4>
static SimdVector __vectorcall LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT;
template <typename MyVector4>
static void __vectorcall StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT;
// Comparison and classification wrappers: each collapses the per-lane F128 mask into a
// single bool ("all lanes" for the Cmp* family and InBound, "any lane" for IsNaN/IsInfinite).
static bool __vectorcall CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpEq(vec1, vec2)); }
static bool __vectorcall CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpLt(vec1, vec2)); }
static bool __vectorcall CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpLe(vec1, vec2)); }
static bool __vectorcall CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpGt(vec1, vec2)); }
static bool __vectorcall CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpGe(vec1, vec2)); }
static bool __vectorcall CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpNe(vec1, vec2)); }
static bool __vectorcall CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::CmpNearEq(vec1, vec2, eps)); }
static bool __vectorcall IsNaN(SimdVectorArg vec) NLIB_NOEXCEPT { return !F128::IsAllMaskFalse(F128::IsNaN(vec)); }
static bool __vectorcall IsInfinite(SimdVectorArg vec) NLIB_NOEXCEPT { return !F128::IsAllMaskFalse(F128::IsInfinite(vec)); }
static bool __vectorcall InBound(SimdVectorArg vec, SimdVectorArg bounds) NLIB_NOEXCEPT { return F128::IsAllMaskTrue(F128::InBound(vec, bounds)); }
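// --- Usage sketch (not part of the header): exercising the comparison wrappers above.
// Vectors are built with F128::LoadA4 exactly as the header itself does; everything
// else (the function name, the sample values) is illustrative only.
inline bool NearlyEqualExample() {
  const float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float b[4] = {1.000001f, 2.0f, 3.0f, 4.0f};
  const float e[4] = {0.001f, 0.001f, 0.001f, 0.001f};
  f128 va = F128::LoadA4(a);
  f128 vb = F128::LoadA4(b);
  f128 eps = F128::LoadA4(e);
  bool exact = Vector4::CmpEq(va, vb);           // false: lane 0 differs slightly
  bool close = Vector4::CmpNearEq(va, vb, eps);  // true: every lane is within eps
  return !exact && close;
}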
// Lane-selective dot product declaration (defined below); the non-type template flags
// choose which result lanes receive the dot product.
template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
static f128 __vectorcall DotEx(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT;
static f128 __vectorcall  // (remainder of this second declaration is not captured in the excerpt)
#define NLIB_M(tp) inline tp __vectorcall
#define NLIB_MH(tp) inline tp __vectorcall
// Load/store helpers: the user's MyVector4 is read/written as four contiguous floats
// starting at &p->x. (Enclosing signatures reconstructed; the bodies are from the source.)
template <typename MyVector4>
NLIB_MH(SimdVector) Vector4::LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT {
  return F128::LoadA4(reinterpret_cast<const float*>(&p->x));
}
template <typename MyVector4>
NLIB_MH(void) Vector4::StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT {
  F128::StoreA4(reinterpret_cast<float*>(&p->x), vec);
}
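// --- Usage sketch (not part of the header): round-tripping a plain struct through the
// helpers above. The struct and function names here are illustrative; LoadFloat4 /
// StoreFloat4 follow the reconstructed names above.
struct MyFloat4 { float x, y, z, w; };
inline void SquareComponentsExample(MyFloat4* p) {
  SimdVector v = Vector4::LoadFloat4(p);  // reads p->x..p->w into the four lanes
  v = F128::Mult(v, v);                   // lane-wise multiply (F128::Mult is used by the header)
  Vector4::StoreFloat4(p, v);             // writes the squared components back
}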
// Vector4::Dot: every lane of the result holds the dot product vec1 . vec2.
// (Enclosing signature and the #if/return skeleton reconstructed; bodies are from the source.)
NLIB_M(f128) Vector4::Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] =
      vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
      vec1.vec.v[2] * vec2.vec.v[2] + vec1.vec.v[3] * vec2.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_dp_ps(vec1, vec2, 0xFF);  // multiply all four lanes, broadcast the sum everywhere
#elif defined(NLIB_NEON)
#ifdef __aarch64__
  float32x4_t tmp = vmulq_f32(vec1, vec2);
  tmp = vpaddq_f32(tmp, tmp);  // two pairwise-add rounds leave the full sum in every lane
  tmp = vpaddq_f32(tmp, tmp);
  return tmp;
#else
  float32x4_t tmp = vmulq_f32(vec1, vec2);
  float32x2_t v1 = vget_low_f32(tmp);
  float32x2_t v2 = vget_high_f32(tmp);
  v1 = vpadd_f32(v1, v1);
  v2 = vpadd_f32(v2, v2);
  v1 = vadd_f32(v1, v2);
  return vcombine_f32(v1, v1);
#endif
#else
  f128 tmp = F128::Mult(vec1, vec2);  // paired-singles fallback
  f32x2 val = __PS_ADD(tmp.vec.ps[0], tmp.vec.ps[1]);
  val = __PS_SUM0(val, val, val);
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
  return ret;
#endif
}
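// --- Usage sketch (not part of the header): Dot broadcasts the scalar result to all
// four lanes, so it can feed further lane-wise math without a shuffle. Names other
// than Vector4::Dot / F128::LoadA4 / F128::StoreA4 are illustrative.
inline float DotExample() {
  const float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
  f128 d = Vector4::Dot(F128::LoadA4(a), F128::LoadA4(b));
  float out[4];
  F128::StoreA4(out, d);
  // out[0] == out[1] == out[2] == out[3] == 1*5 + 2*6 + 3*7 + 4*8 == 70
  return out[0];
}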
// DotEx: dot product of vec1 and vec2, written only to the result lanes selected by
// the SetLane flags; unselected lanes are zero. (Enclosing signature reconstructed.)
template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
NLIB_M(f128) Vector4::DotEx(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41)
  // Low nibble of the immediate picks the destination lanes; 0xF0 sums all four products.
  return _mm_dp_ps(vec1, vec2, (0xF0 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
                                (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
#else
  return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
#endif
}
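// --- Usage sketch (not part of the header): DotEx writes the dot product only to the
// selected lanes, which the generic fallbacks below rely on to assemble several dot
// products into one register with F128::Or. The function name here is illustrative.
inline void DotExLane0Example(f128 a, f128 b, float out[4]) {
  f128 lane0_only = Vector4::DotEx<true, false, false, false>(a, b);  // (a.b, 0, 0, 0)
  F128::StoreA4(out, lane0_only);
}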
// Helper (name and exact 64-bit/32-bit NEON guard not captured in this excerpt):
// packs two dot products into one register as (a.b0, a.b1, 0, 0).
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
#ifdef __aarch64__
  float32x4_t zero = vdupq_n_f32(0);
  float32x4_t tmp0 = vmulq_f32(a, b0);
  float32x4_t tmp1 = vmulq_f32(a, b1);
  tmp0 = vpaddq_f32(tmp0, tmp1);  // pairwise sums of a*b0 and a*b1
  tmp0 = vpaddq_f32(tmp0, zero);  // (a.b0, a.b1, 0, 0)
  return tmp0;
#else
  // 32-bit NEON: interleave b0/b1, multiply against a's duplicated x/z and y/w lanes,
  // then add the two halves to get (a.b0, a.b1).
  float32x4x2_t trn = vtrnq_f32(b0, b1);
  float32x2_t lo = vget_low_f32(a);
  float32x2_t hi = vget_high_f32(a);
  float32x4_t xxzz = vcombine_f32(vdup_lane_f32(lo, 0), vdup_lane_f32(hi, 0));
  float32x4_t yyww = vcombine_f32(vdup_lane_f32(lo, 1), vdup_lane_f32(hi, 1));
  float32x4_t tmp = vmulq_f32(trn.val[0], xxzz);
  tmp = vmlaq_f32(tmp, yyww, trn.val[1]);
  float32x2_t result = vadd_f32(vget_low_f32(tmp), vget_high_f32(tmp));
  return vcombine_f32(result, vdup_n_f32(0.f));
#endif
#else
  // Generic path: build each dot product in its own lane and merge.
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  return F128::Or(t0, t1);
#endif
// Helper (name and exact NEON guards not captured in this excerpt): packs three dot
// products into one register as (a.b0, a.b1, a.b2, 0).
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
#ifdef __aarch64__
  float32x4_t zero = vdupq_n_f32(0);
  float32x4_t tmp0 = vmulq_f32(a, b0);
  float32x4_t tmp1 = vmulq_f32(a, b1);
  float32x4_t tmp2 = vmulq_f32(a, b2);
  tmp0 = vpaddq_f32(tmp0, tmp1);
  tmp2 = vpaddq_f32(tmp2, zero);
  tmp0 = vpaddq_f32(tmp0, tmp2);  // (a.b0, a.b1, a.b2, 0)
  return tmp0;
#else
  // 32-bit NEON: transpose b0/b1/b2 (padded with a zero row) into row0..row3; the
  // multiply-accumulate against a's lanes that follows is not captured in this excerpt.
  f128 row0, row1, row2, row3;
  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
  float32x4x2_t trn_f1_ = vtrnq_f32(b2, vdupq_n_f32(0.f));
  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
#endif
#else
  // Generic path: one lane per dot product, merged with OR.
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
  return F128::Or(F128::Or(t0, t1), t2);
#endif
// Helper (name and exact NEON guards not captured in this excerpt): packs four dot
// products into one register as (a.b0, a.b1, a.b2, a.b3).
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
#ifdef __aarch64__
  float32x4_t tmp0 = vmulq_f32(a, b0);
  float32x4_t tmp1 = vmulq_f32(a, b1);
  float32x4_t tmp2 = vmulq_f32(a, b2);
  float32x4_t tmp3 = vmulq_f32(a, b3);
  tmp0 = vpaddq_f32(tmp0, tmp1);
  tmp2 = vpaddq_f32(tmp2, tmp3);
  tmp0 = vpaddq_f32(tmp0, tmp2);  // (a.b0, a.b1, a.b2, a.b3)
  return tmp0;
#else
  // 32-bit NEON: transpose b0..b3 into row0..row3; the multiply-accumulate against
  // a's lanes that follows is not captured in this excerpt.
  f128 row0, row1, row2, row3;
  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
  float32x4x2_t trn_f1_ = vtrnq_f32(b2, b3);
  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
#endif
#else
  // Generic path: one lane per dot product, merged with OR.
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
  f128 t3 = Vector4::DotEx<false, false, false, true>(a, b3);
  return F128::Or(F128::Or(t0, t1), F128::Or(t2, t3));
#endif
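// --- Note (not part of the header): packing the four dot products (a.b0, a.b1, a.b2, a.b3)
// into one register is exactly a row vector times the transpose of the matrix whose rows
// are b0..b3. A plain scalar sketch of what the helper above computes:
inline void FourDotsScalarSketch(const float a[4], const float b[4][4], float out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = a[0] * b[i][0] + a[1] * b[i][1] + a[2] * b[i][2] + a[3] * b[i][3];
  }
}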
// Vector4::Normalize: vec scaled by 1 / |vec|; a zero vector yields zero and an
// infinite squared length yields NaN. (Signature, the #if/#else skeleton, and the
// lines marked "reconstructed" are inferred; the rest is from the source.)
NLIB_M(SimdVector) Vector4::Normalize(SimdVectorArg vec) NLIB_NOEXCEPT {
  f128 dot = Vector4::Dot(vec, vec);
#if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
  float32x4_t x = vrsqrteq_f32(dot);                              // initial 1/sqrt estimate
  x = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));           // Newton-Raphson step 1
  f128 rsqrt = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));  // Newton-Raphson step 2
#else
  f128 rsqrt = F128::RecpSqrt(dot);
#endif
  f128 inf = F128::SetInfinity();
  f128 eqzero = F128::CmpEqZero(dot);
  f128 eqinf = F128::CmpEq(dot, inf);
  f128 ret = F128::Mult(vec, rsqrt);    // reconstructed: scale vec by the reciprocal length
  f128 nan = F128::SetNaN();
  ret = F128::AndNot(eqzero, ret);      // zero-length input -> zero output
  ret = F128::Select(eqinf, nan, ret);  // overflowed squared length -> NaN
  return ret;                           // reconstructed
}
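// --- Usage sketch (not part of the header): Normalize is the checked variant - a zero
// vector stays zero and an overflowing squared length produces NaN lanes - while the
// *Est helpers below trade that checking and some accuracy for speed.
inline void NormalizeExample(float out[4]) {
  const float v[4] = {3.0f, 0.0f, 4.0f, 0.0f};
  F128::StoreA4(out, Vector4::Normalize(F128::LoadA4(v)));
  // out is approximately {0.6f, 0.0f, 0.8f, 0.0f}, since |v| = 5.
}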
// Length and reciprocal-length helpers built on Dot(vec, vec); the *Est variants use the
// estimate instructions. (Names other than RecpLengthEst, which the source itself calls,
// are reconstructed.)
NLIB_M(f128) Vector4::Length(SimdVectorArg vec) NLIB_NOEXCEPT { return F128::Sqrt(Dot(vec, vec)); }
NLIB_M(f128) Vector4::LengthEst(SimdVectorArg vec) NLIB_NOEXCEPT { return F128::SqrtEst(Dot(vec, vec)); }
NLIB_M(f128) Vector4::RecpLength(SimdVectorArg vec) NLIB_NOEXCEPT { return F128::RecpSqrt(Dot(vec, vec)); }
NLIB_M(f128) Vector4::RecpLengthEst(SimdVectorArg vec) NLIB_NOEXCEPT { return F128::RecpSqrtEst(Dot(vec, vec)); }
NLIB_M(SimdVector) Vector4::NormalizeEst(SimdVectorArg vec) NLIB_NOEXCEPT { return F128::Mult(vec, RecpLengthEst(vec)); }
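// --- Usage sketch (not part of the header): the exact and estimated length helpers share
// the same Dot(vec, vec) core; only the square-root step differs. Function and variable
// names here are illustrative and follow the reconstructed names above.
inline float LengthExample() {
  const float v[4] = {1.0f, 2.0f, 2.0f, 4.0f};
  f128 vec = F128::LoadA4(v);
  float exact[4];
  float approx[4];
  F128::StoreA4(exact, Vector4::Length(vec));      // every lane holds 5.0f (sqrt of 1+4+4+16)
  F128::StoreA4(approx, Vector4::LengthEst(vec));  // close to 5.0f; error is platform-dependent
  return exact[0] - approx[0];
}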
// GetAngle (name reconstructed): angle between two normalized vectors; the clamp keeps rounding error from leaving acos's [-1, 1] domain.
NLIB_M(f128) Vector4::GetAngle(SimdVectorArg vec1_normalized, SimdVectorArg vec2_normalized) NLIB_NOEXCEPT {
  f128 ret = Dot(vec1_normalized, vec2_normalized);
  ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
  return F128::ArcCos(ret);
}
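// --- Usage sketch (not part of the header): the unit X and Y axes are orthogonal, so the
// angle comes out near pi/2. The GetAngle name follows the reconstruction above.
inline float RightAngleExample() {
  const float x_axis[4] = {1.0f, 0.0f, 0.0f, 0.0f};
  const float y_axis[4] = {0.0f, 1.0f, 0.0f, 0.0f};
  float out[4];
  F128::StoreA4(out, Vector4::GetAngle(F128::LoadA4(x_axis), F128::LoadA4(y_axis)));
  return out[0];  // approximately 1.5708 (pi / 2), since the dot product is 0
}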
// Likely the component-removal / reflection step (vec minus a multiple of normal); one
// intervening source line between these two statements is not captured in this excerpt.
f128 s = Dot(vec, normal);
return F128::MultSub(s, normal, vec);
#endif  // NLIB_DOXYGEN

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_