16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_ 29 template<
typename MyVector3>
32 template<
typename MyVector3>
46 template<
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
// NLIB_M / NLIB_MH expand to `inline <type> __vectorcall` for function definitions.
// What follows are the per-backend bodies that build an f128 from the x, y, z
// members of *p. NOTE(review): the signature line is not present in this listing;
// the name LoadFloat3 is taken from the Vector3::LoadFloat3 call further down.
//
// SSE4.1 path: each _mm_load_ss puts one float in lane 0 and zeroes lanes 1-3.
82 #define NLIB_M(tp) inline tp __vectorcall 83 #define NLIB_MH(tp) inline tp __vectorcall 87 #ifdef NLIB_F128_SIMD_NOUSE 94 #elif defined(NLIB_SSE41) 95 __m128 x = _mm_load_ss(&p->x);
96 __m128 y = _mm_load_ss(&p->y);
97 __m128
z = _mm_load_ss(&p->z);
// Interleave the low lanes: xy = (x, y, 0, 0).
98 __m128 xy = _mm_unpacklo_ps(x, y);
// Low pair of xy followed by low pair of z: result = (x, y, z, 0).
99 return _mm_shuffle_ps(xy,
z, _MM_SHUFFLE(1, 0, 1, 0));
// NEON path: vld1_f32 reads two consecutive floats starting at &p->x
// (assumes y immediately follows x in memory - TODO confirm against Float3).
100 #elif defined(NLIB_NEON) 101 float32x2_t xy = vld1_f32(&p->x);
// Overwrite lane 0 of a copy of xy with p->z, giving (z, y).
102 float32x2_t
z = vld1_lane_f32(&p->z, xy, 0);
// Result = (x, y, z, y): unlike the SSE path, lane 3 holds y rather than 0.
103 return vcombine_f32(xy,
z);
// Remaining path: copies x, y, z into ps[0][0], ps[0][1], ps[1][0].
// ps[1][1] is not written in the lines visible here.
106 ret.vec.ps[0][0] = p->x;
107 ret.vec.ps[0][1] = p->y;
108 ret.vec.ps[1][0] = p->z;
// Generic overload for any MyVector3 type: reinterprets the address of p->x as a
// const Float3* and forwards to Vector3::LoadFloat3.
// NOTE(review): presumably requires MyVector3 to store x, y, z as contiguous floats
// with the same layout as Float3 - confirm (any checks between the template header
// and the return are not visible in this listing).
113 template<
typename MyVector3>
120 return Vector3::LoadFloat3(reinterpret_cast<const Float3*>(&p->x));
// Per-backend bodies that write lanes 0, 1, 2 of `vec` to p->x, p->y, p->z.
// NOTE(review): the signature line is not present in this listing; the name
// StoreFloat3 is taken from the Vector3::StoreFloat3 call further down.
//
// SSE4.1 path: broadcast lane 1 and lane 2 into temporaries so that each value
// sits in lane 0, then store three scalars individually (lane 3 is never written).
125 #ifdef NLIB_F128_SIMD_NOUSE 129 #elif defined(NLIB_SSE41) 130 f128 y = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
131 f128 z = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));
132 _mm_store_ss(&p->x, vec);
133 _mm_store_ss(&p->y, y);
134 _mm_store_ss(&p->z, z);
// NEON path: `lo` is the low half (lanes 0 and 1) of vec. The statement that
// consumes `lo` is not visible here (presumably a two-float store to &p->x).
135 #elif defined(NLIB_NEON) 136 float32x2_t lo = vget_low_f32(vec);
// Store lane 2 alone to p->z.
138 vst1q_lane_f32(&p->z, vec, 2);
// Remaining path: copy from ps[0][0], ps[0][1], ps[1][0].
140 p->x = vec.vec.ps[0][0];
141 p->y = vec.vec.ps[0][1];
142 p->z = vec.vec.ps[1][0];
// Generic overload for any MyVector3 type: reinterprets the address of p->x as a
// Float3* and forwards to Vector3::StoreFloat3.
// NOTE(review): presumably requires MyVector3 to store x, y, z as contiguous floats
// with the same layout as Float3 - confirm (any checks between the template header
// and the call are not visible in this listing).
146 template<
typename MyVector3>
154 Vector3::StoreFloat3(reinterpret_cast<Float3*>(&p->x), vec);
// Bodies of the three-component comparison predicates (signature lines are not
// present in this listing). Each one follows the same pattern:
//   1. build a per-lane mask with the matching F128 comparison;
//   2. Swizzle<0, 1, 2, 2> the mask so lane 3 is replaced by a copy of lane 2 -
//      this keeps whatever is in lane 3 of the inputs out of the result;
//   3. reduce the mask to a bool (IsAllMaskTrue presumably checks all four lanes).
//
// x, y, z all equal.
159 f128 mask = F128::CmpEq(vec1, vec2);
160 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// x, y, z all less-than.
165 f128 mask = F128::CmpLt(vec1, vec2);
166 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// x, y, z all less-or-equal.
171 f128 mask = F128::CmpLe(vec1, vec2);
172 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// x, y, z all greater-than.
177 f128 mask = F128::CmpGt(vec1, vec2);
178 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// x, y, z all greater-or-equal.
183 f128 mask = F128::CmpGe(vec1, vec2);
184 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// Not-equal uses the opposite reduction: true as soon as the mask is not entirely
// false, i.e. when at least one of x, y, z differs.
189 f128 mask = F128::CmpNe(vec1, vec2);
190 return !F128::IsAllMaskFalse(F128::Swizzle<0, 1, 2, 2>(mask));
// x, y, z all equal within the tolerance `eps` (tolerance semantics are defined by
// F128::CmpNearEq, which is not visible here).
195 f128 mask = F128::CmpNearEq(vec1, vec2, eps);
196 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// NaN test on lanes 0-2 (lane 3 is replaced by a copy of lane 2 before reducing).
// NOTE(review): the reduction is IsAllMaskTrue, so this appears to return true
// only when x, y AND z are all NaN. If the intended contract is "any component is
// NaN", the reduction should be !IsAllMaskFalse as in the not-equal predicate -
// confirm against the documented behaviour before changing.
201 f128 mask = F128::IsNaN(vec);
202 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// Infinity test on lanes 0-2, same pattern.
// NOTE(review): same concern - true only when all three components are infinite.
207 f128 mask = F128::IsInfinite(vec);
208 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// Bounds test: true when the F128::InBound mask is set for x, y and z. Lane 3 is
// replaced by a copy of lane 2 before reducing, so lane 3 of the inputs is ignored.
// The exact per-lane condition is defined by F128::InBound (not visible here;
// presumably |vec| <= bounds per lane - TODO confirm).
213 f128 mask = F128::InBound(vec, bounds);
214 return F128::IsAllMaskTrue(F128::Swizzle<0, 1, 2, 2>(mask));
// Per-backend bodies of the three-component dot product (the signature line is
// not present in this listing). Every path ends with the scalar
// x1*x2 + y1*y2 + z1*z2 replicated across the result lanes.
//
// Non-SIMD path: scalar sum of the first three products, then splat to v[0..3].
219 #ifdef NLIB_F128_SIMD_NOUSE 220 float tmp = vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
221 vec1.vec.v[2] * vec2.vec.v[2];
223 rval.vec.v[0] = rval.vec.v[1] = rval.vec.v[2] = rval.vec.v[3] = tmp;
// SSE4.1 path: in the _mm_dp_ps immediate 0x7F, the high nibble 0x7 selects lanes
// 0-2 as inputs to the sum and the low nibble 0xF writes the sum to all four lanes.
225 #elif defined(NLIB_SSE41) 226 return _mm_dp_ps(vec1, vec2, 0x7F);
// NEON path, first variant (the guard line is not visible; vpaddq_f32 suggests it
// is the AArch64 branch - TODO confirm). Multiply per lane, replace lane 3 with 0
// (Permute index 6 presumably selects lane 2 of the zero vector), then two
// pairwise adds leave the total in every lane.
227 #elif defined(NLIB_NEON) 229 float32x4_t tmp = vmulq_f32(vec1, vec2);
230 tmp = F128::Permute<0, 1, 2, 6>(tmp, vdupq_n_f32(0.f));
231 tmp = vpaddq_f32(tmp, tmp);
232 tmp = vpaddq_f32(tmp, tmp);
// NEON path, second variant: 64-bit half-vector arithmetic.
235 f128 tmp = vmulq_f32(vec1, vec2);
// lo = (p0, p1) -> (p0 + p1, p0 + p1)
236 float32x2_t lo = vget_low_f32(tmp);
237 lo = vpadd_f32(lo, lo);
// hi = (p2, p2); lane 3 of the product is never read.
238 float32x2_t hi = vdup_lane_f32(vget_high_f32(tmp), 0);
239 lo = vadd_f32(lo, hi);
// Duplicate the two-lane sum into all four lanes.
240 return vcombine_f32(lo, lo);
// Paired-single path: multiplies, combines the two halves with __PS_SUM0/__PS_ADD
// and duplicates val[0] into both pairs. NOTE(review): exact lane semantics of the
// __PS_* intrinsics are not verifiable from this listing.
243 f128 tmp = F128::Mult(vec1, vec2);
244 f32x2 val = __PS_SUM0(tmp.vec.ps[0], tmp.vec.ps[0], tmp.vec.ps[0]);
245 val = __PS_ADD(val, tmp.vec.ps[1]);
247 ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
// Dot product variant whose four bool template arguments choose which result
// lanes receive the value (signature line not present in this listing).
252 template<
bool SetLane0,
bool SetLane1,
bool SetLane2,
bool SetLane3>
// SSE4.1 path: only the immediate expression is visible here. 0x70 selects lanes
// 0-2 as inputs; bits 0-3 are set from SetLane0..SetLane3. Presumably this is the
// third argument of _mm_dp_ps, which zeroes the output lanes whose bit is clear.
255 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41) 258 (0x70 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) | (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
// Other backends: compute the ordinary Dot and hand it to F128::Splat together
// with a zero vector (presumably the selected lanes take the dot and the rest stay 0).
260 return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
// Cross-product fragment (signature and the left-hand side of the first statement
// are not present in this listing).
// First term: (y1, z1, x1) * (z2, x2, y2); the -1 swizzle index is presumably a
// "don't care" for lane 3.
267 F128::Mult(F128::Swizzle<1, 2, 0, -1>(vec1), F128::Swizzle<2, 0, 1, -1>(vec2));
// Second term uses the opposite rotation, (z1, x1, y1) * (y2, z2, x2), combined
// with the first through MultSub. NOTE(review): assumes MultSub(a, b, c) computes
// c - a * b, which would give (y1*z2 - z1*y2, z1*x2 - x1*z2, x1*y2 - y1*x2) - confirm.
269 rval = F128::MultSub(F128::Swizzle<2, 0, 1, -1>(vec1), F128::Swizzle<1, 2, 0, -1>(vec2), rval);
// Normalization with explicit handling of zero and infinite squared length
// (signature line, the declaration of `x`, and the statement that first assigns
// `ret` are not present in this listing).
// dot = squared length of vec, via the three-component dot product.
276 f128 dot = Vector3::Dot(vec, vec);
// NEON path: hardware reciprocal-square-root estimate refined with two
// Newton-Raphson steps (vrsqrtsq_f32(a, b) yields (3 - a*b) / 2).
277 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 279 x = vrsqrteq_f32(dot);
280 x = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
281 f128 rsqrt = vmulq_f32(x, vrsqrtsq_f32(dot, vmulq_f32(x, x)));
// Other backends defer to the library routine.
283 f128 rsqrt = F128::RecpSqrt(dot);
// Masks for the two special cases: squared length == 0 and squared length == +inf.
285 f128 inf = F128::SetInfinity();
286 f128 eqzero = F128::CmpEqZero(dot);
287 f128 eqinf = F128::CmpEq(dot, inf);
289 f128 nan = F128::SetNaN();
// Squared length of zero: mask the result out (presumably AndNot(a, b) is ~a & b,
// which would make the result all zeros - TODO confirm).
290 ret = F128::AndNot(eqzero, ret);
// Infinite squared length: select NaN in place of the computed result.
291 ret = F128::Select(eqinf, nan, ret);
// Normalizing variant that writes the scaled vector through `normalized` and
// returns a float (signature line not present in this listing).
// Unlike the variant with eqzero/eqinf masks, no special-casing of zero or
// infinite squared length is visible here.
296 f128 dot = Vector3::Dot(vec, vec);
297 f128 rsqrt = F128::RecpSqrt(dot);
298 *normalized = F128::Mult(vec, rsqrt);
// NOTE(review): the value returned is lane 0 of `dot`, i.e. the SQUARED length,
// not the length - confirm this matches the documented contract.
299 return F128::GetFloatFromLane<0>(dot);
// One-line bodies of the length helpers (signature lines are not present in this
// listing). All of them start from Dot(vec, vec), the squared length.
//
// Squared length.
304 return Dot(vec, vec);
// Length: square root of the squared length.
309 return F128::Sqrt(Dot(vec, vec));
// Length via F128::SqrtEst (presumably a faster, lower-precision square root).
314 return F128::SqrtEst(Dot(vec, vec));
// Reciprocal length.
319 return F128::RecpSqrt(Dot(vec, vec));
// Reciprocal length via F128::RecpSqrtEst (presumably lower precision).
324 return F128::RecpSqrtEst(Dot(vec, vec));
// Estimate-based normalization (signature lines are not present in this listing).
// Scales vec by RecpLengthEst(vec); no handling of zero-length input is visible.
328 return F128::Mult(vec, RecpLengthEst(vec));
// Out-parameter variant: writes vec * RecpSqrtEst(dot) through `normalized`.
332 f128 dot = Vector3::Dot(vec, vec);
333 f128 rsqrt = F128::RecpSqrtEst(dot);
334 *normalized = F128::Mult(vec, rsqrt);
// NOTE(review): returns lane 0 of `dot`, i.e. the SQUARED length - confirm this
// matches the documented contract.
335 return F128::GetFloatFromLane<0>(dot);
// Angle between two vectors that the parameter names indicate are already
// normalized (signature line not present in this listing).
// The dot product of the two inputs is the cosine of the angle when both are unit length.
341 f128 ret = Dot(vec1_normalized, vec2_normalized);
// Clamp to [-1, 1] so the value stays inside the arccosine domain.
342 ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
343 return F128::ArcCos(ret);
// Fragment combining `vec` with its projection on `normal` (signature line and one
// statement between these two lines are not present in this listing).
// s = dot(vec, normal), replicated across lanes.
347 f128 s = Dot(vec, normal);
// NOTE(review): assuming MultSub(a, b, c) computes c - a * b, this returns
// vec - s * normal. A reflection would need s doubled first; the missing
// statement presumably does that - confirm.
349 return F128::MultSub(s, normal, vec);
// Final statement of a matrix transform (everything that computes `ret` is not
// present in this listing): adds matrix row m.r[3] to the accumulated result.
// Presumably r[3] is the translation row - TODO confirm.
365 return F128::Add(m.r[3], ret);
// Separate function body: transform, then divide the result by a vector built
// from its own lane 3 (SetValue<3>(tmp, each_select32) presumably broadcasts
// lane 3 to all lanes). No guard against lane 3 being zero is visible.
370 f128 tmp = Vector3::Transform(vec, m);
371 return F128::Div(tmp, F128::SetValue<3>(tmp,
each_select32));
// Final statements of two quaternion rotation helpers (signature lines and the
// setup of `conj` and `v` are not present in this listing). Both sandwich `v`
// between the quaternion and `conj` (presumably its conjugate); they differ only
// in which side `conj` is applied, which presumably makes one the inverse
// rotation of the other.
//
// (conj * v) * q_normalized
394 return Quaternion::Mult(Quaternion::Mult(conj, v), q_normalized);
// (q_normalized * v) * conj
401 return Quaternion::Mult(Quaternion::Mult(q_normalized, v), conj);
407 #endif // NLIB_DOXYGEN 412 #endif // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_
f128arg SimdVectorArg
f128arg is defined using typedef.
f128arg SimdQuaternionArg
f128arg is defined using typedef.
The class with the collection of functions that perform calculations on three-dimensional vectors...
The structure for keeping a 4x4 matrix.
A dummy class provided for the convenience of creating a document, which does not exist...
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
The type for reading and writing three-dimensional vectors in memory. Keeps float-type x...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
nlib_f128_t f128
nlib_f128_t is defined using typedef.
#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.
Defines a four-dimensional vector.
f128 SimdQuaternion
f128 is defined using typedef. Used when handling quaternions.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...