3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_ 4 #define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_ 16 template<
typename MyVector3>
19 template<
typename MyVector3>
27 static bool __vectorcall
33 template <
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
38 static float __vectorcall Normalize(
SimdVector* normalized,
40 static float __vectorcall NormalizeEst(
SimdVector* normalized,
72 #define NLIB_M(tp) inline tp __vectorcall 73 #define NLIB_MH(tp) inline tp __vectorcall 77 #ifdef NLIB_F128_SIMD_NOUSE 84 #elif defined(NLIB_SSE41) 85 __m128 x = _mm_load_ss(&p->
x);
86 __m128 y = _mm_load_ss(&p->
y);
87 __m128 z = _mm_load_ss(&p->
z);
88 __m128 xy = _mm_unpacklo_ps(x, y);
89 return _mm_shuffle_ps(xy, z, _MM_SHUFFLE(1, 0, 1, 0));
90 #elif defined(NLIB_NEON) 91 float32x2_t xy = vld1_f32(&p->
x);
92 float32x2_t z = vld1_lane_f32(&p->
z, xy, 0);
93 return vcombine_f32(xy, z);
96 ret.vec.ps[0][0] = p->
x;
97 ret.vec.ps[0][1] = p->
y;
98 ret.vec.ps[1][0] = p->
z;
103 template<
typename MyVector3>
110 return Vector3::LoadFloat3(reinterpret_cast<const Float3*>(&p->x));
115 #ifdef NLIB_F128_SIMD_NOUSE 119 #elif defined(NLIB_SSE41) 120 f128 y = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
121 f128 z = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));
122 _mm_store_ss(&p->
x, vec);
123 _mm_store_ss(&p->
y, y);
124 _mm_store_ss(&p->
z, z);
125 #elif defined(NLIB_NEON) 126 float32x2_t lo = vget_low_f32(vec);
128 vst1q_lane_f32(&p->
z, vec, 2);
130 p->
x = vec.vec.ps[0][0];
131 p->
y = vec.vec.ps[0][1];
132 p->
z = vec.vec.ps[1][0];
136 template<
typename MyVector3>
143 Vector3::StoreFloat3(reinterpret_cast<Float3*>(&p->x), vec);
148 f128 mask = F128::CmpEq(vec1, vec2);
149 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
154 f128 mask = F128::CmpLt(vec1, vec2);
155 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
160 f128 mask = F128::CmpLe(vec1, vec2);
161 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
166 f128 mask = F128::CmpGt(vec1, vec2);
167 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
172 f128 mask = F128::CmpGe(vec1, vec2);
173 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
178 f128 mask = F128::CmpNe(vec1, vec2);
179 return !F128::IsAllMaskFalse(F128::Swizzle<0, 1, 2, 2>(mask));
182 inline bool __vectorcall
184 f128 mask = F128::CmpNearEq(vec1, vec2, eps);
185 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
190 f128 mask = F128::IsNaN(vec);
191 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
196 f128 mask = F128::IsInfinite(vec);
197 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
202 f128 mask = F128::InBound(vec, bounds);
203 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
208 #ifdef NLIB_F128_SIMD_NOUSE 209 float tmp = vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
210 vec1.vec.v[2] * vec2.vec.v[2];
212 rval.vec.v[0] = rval.vec.v[1] = rval.vec.v[2] = rval.vec.v[3] = tmp;
214 #elif defined(NLIB_SSE41) 215 return _mm_dp_ps(vec1, vec2, 0x7F);
216 #elif defined(NLIB_NEON) 218 float32x4_t tmp = vmulq_f32(vec1, vec2);
219 tmp = F128::Permute<0, 1, 2, 6>(tmp, vdupq_n_f32(0.f));
220 tmp = vpaddq_f32(tmp, tmp);
221 tmp = vpaddq_f32(tmp, tmp);
224 f128 tmp = vmulq_f32(vec1, vec2);
225 float32x2_t lo = vget_low_f32(tmp);
226 lo = vpadd_f32(lo, lo);
227 float32x2_t hi = vdup_lane_f32(vget_high_f32(tmp), 0);
228 lo = vadd_f32(lo, hi);
229 return vcombine_f32(lo, lo);
232 f128 tmp = F128::Mult(vec1, vec2);
233 f32x2 val = __PS_SUM0(tmp.vec.ps[0], tmp.vec.ps[0], tmp.vec.ps[0]);
234 val = __PS_ADD(val, tmp.vec.ps[1]);
236 ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
241 template <
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
244 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41) 245 return _mm_dp_ps(vec1, vec2,
246 (0x70 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
247 (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
249 return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
255 SimdVector rval = F128::Mult(F128::Swizzle<1, 2, 0, -1>(vec1),
256 F128::Swizzle<2, 0, 1, -1>(vec2));
258 rval = F128::MultSub(F128::Swizzle<2, 0, 1, -1>(vec1), F128::Swizzle<1, 2, 0, -1>(vec2), rval);
265 f128 dot = Vector3::Dot(vec, vec);
266 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 268 x = vrsqrteq_f32(dot);
269 x = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
270 f128 rsqrt = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
272 f128 rsqrt = F128::RecpSqrt(dot);
274 f128 inf = F128::SetInfinity();
275 f128 eqzero = F128::CmpEqZero(dot);
276 f128 eqinf = F128::CmpEq(dot, inf);
278 f128 nan = F128::SetNaN();
279 ret = F128::AndNot(eqzero, ret);
280 ret = F128::Select(eqinf, nan, ret);
285 f128 dot = Vector3::Dot(vec, vec);
286 f128 rsqrt = F128::RecpSqrt(dot);
287 *normalized = F128::Mult(vec, rsqrt);
288 return F128::GetFloatFromLane<0>(dot);
296 return F128::Sqrt(Dot(vec, vec));
301 return F128::SqrtEst(Dot(vec, vec));
306 return F128::RecpSqrt(Dot(vec, vec));
311 return F128::RecpSqrtEst(Dot(vec, vec));
315 return F128::Mult(vec, RecpLengthEst(vec));
319 f128 dot = Vector3::Dot(vec, vec);
320 f128 rsqrt = F128::RecpSqrtEst(dot);
321 *normalized = F128::Mult(vec, rsqrt);
322 return F128::GetFloatFromLane<0>(dot);
328 f128 ret = Dot(vec1_normalized, vec2_normalized);
329 ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
330 return F128::ArcCos(ret);
334 f128 s = Dot(vec, normal);
336 return F128::MultSub(s, normal, vec);
352 return F128::Add(m.
r[3], ret);
357 f128 tmp = Vector3::Transform(vec, m);
358 return F128::Div(tmp, F128::SetValue<3>(tmp,
each_select32));
381 return Quaternion::Mult(Quaternion::Mult(conj, v), q_normalized);
388 return Quaternion::Mult(Quaternion::Mult(q_normalized, v), conj);
394 #endif // NLIB_DOXYGEN 399 #endif // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_ float x
The x-coordinate of the 3D vector.
f128arg SimdVectorArg
f128arg is defined using typedef.
f128arg SimdQuaternionArg
f128arg is defined using typedef.
The class with the collection of functions that perform calculations on three-dimensional vectors...
f128 r[4]
Keeps each row of a 4x4 matrix.
The structure for keeping a 4x4 matrix.
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
The type for reading and writing three-dimensional vectors in memory. Keeps float-type x, y, and z members.
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
float y
The y-coordinate of the 3D vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
float z
The z-coordinate of the 3D vector.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Defines a four-dimensional vector.
f128 SimdQuaternion
f128 is defined using typedef. Used when handling quaternions.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...