#ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_
// Vector3 is a collection of static functions that operate on
// three-dimensional vectors held in f128 registers. Only the members whose
// definitions survive below are declared here; the rest of the class body
// (and the surrounding includes and namespace) is elided in this excerpt.
class Vector3 {
 public:
  static SimdVector __vectorcall LoadFloat3(const Float3* p) NLIB_NOEXCEPT;
  template <typename MyVector3>
  static SimdVector __vectorcall LoadFloat3(const MyVector3* p) NLIB_NOEXCEPT;
  static void __vectorcall StoreFloat3(Float3* p, SimdVectorArg vec) NLIB_NOEXCEPT;
  template <typename MyVector3>
  static void __vectorcall StoreFloat3(MyVector3* p, SimdVectorArg vec) NLIB_NOEXCEPT;
  static bool __vectorcall CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT;
  // ... (the remaining comparison and test members are elided here)
  template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
  static f128 __vectorcall Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT;
  // ... (dot, cross, length, normalize, transform, and rotate members elided)
};
#ifndef NLIB_DOXYGEN  // implementation details below are hidden from the docs

#define NLIB_M(tp) inline tp __vectorcall
#define NLIB_MH(tp) inline tp __vectorcall
// Loads {x, y, z} from memory into the lower three lanes of a SimdVector.
// The contents of the w lane are not meaningful to Vector3 operations.
NLIB_M(SimdVector) Vector3::LoadFloat3(const Float3* p) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Scalar fallback when SIMD is disabled (body reconstructed).
  f128 ret;
  ret.vec.v[0] = p->x;
  ret.vec.v[1] = p->y;
  ret.vec.v[2] = p->z;
  ret.vec.v[3] = 0.f;
  return ret;
#elif defined(NLIB_SSE41)
  __m128 x = _mm_load_ss(&p->x);
  __m128 y = _mm_load_ss(&p->y);
  __m128 z = _mm_load_ss(&p->z);
  __m128 xy = _mm_unpacklo_ps(x, y);
  return _mm_shuffle_ps(xy, z, _MM_SHUFFLE(1, 0, 1, 0));
#elif defined(NLIB_NEON)
  float32x2_t xy = vld1_f32(&p->x);
  float32x2_t z = vld1_lane_f32(&p->z, xy, 0);
  return vcombine_f32(xy, z);
#elif defined(NLIB_CAFE_PPC)
  f128 ret;
  ret.vec.ps[0][0] = p->x;
  ret.vec.ps[0][1] = p->y;
  ret.vec.ps[1][0] = p->z;
  return ret;
#endif
}
template <typename MyVector3>
NLIB_MH(SimdVector) Vector3::LoadFloat3(const MyVector3* p) NLIB_NOEXCEPT {
  // MyVector3 must have float members x, y, z laid out contiguously;
  // the original's NLIB_STATIC_ASSERT layout checks are elided here.
  return Vector3::LoadFloat3(reinterpret_cast<const Float3*>(&p->x));
}
// Stores the x, y, and z lanes of vec to p; the w lane is not written.
NLIB_M(void) Vector3::StoreFloat3(Float3* p, SimdVectorArg vec) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  // Scalar fallback when SIMD is disabled (body reconstructed).
  p->x = vec.vec.v[0];
  p->y = vec.vec.v[1];
  p->z = vec.vec.v[2];
#elif defined(NLIB_SSE41)
  f128 y = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
  f128 z = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));
  _mm_store_ss(&p->x, vec);
  _mm_store_ss(&p->y, y);
  _mm_store_ss(&p->z, z);
#elif defined(NLIB_NEON)
  float32x2_t lo = vget_low_f32(vec);
  vst1_f32(&p->x, lo);  // writes x and y with one 64-bit store (reconstructed)
  vst1q_lane_f32(&p->z, vec, 2);
#elif defined(NLIB_CAFE_PPC)
  p->x = vec.vec.ps[0][0];
  p->y = vec.vec.ps[0][1];
  p->z = vec.vec.ps[1][0];
#endif
}
template <typename MyVector3>
NLIB_MH(void) Vector3::StoreFloat3(MyVector3* p, SimdVectorArg vec) NLIB_NOEXCEPT {
  // As with LoadFloat3, MyVector3 must have contiguous float x, y, z members.
  Vector3::StoreFloat3(reinterpret_cast<Float3*>(&p->x), vec);
}
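// Usage sketch (illustrative only; MyVec is a hypothetical user type):
//
//   struct MyVec { float x, y, z; };
//   MyVec in = {1.f, 2.f, 3.f};
//   SimdVector v = Vector3::LoadFloat3(&in);  // template overload
//   MyVec out;
//   Vector3::StoreFloat3(&out, v);            // out now equals in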
#if defined(NLIB_NEON)
#define NLIB_NEON_CMPVector3(op, vec1, vec2)                              \
  uint8x16_t cmp = vreinterpretq_u8_u32(op(vec1, vec2));                  \
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));         \
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),           \
                               vreinterpret_u16_u8(zip.val[1]));          \
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) == \
         0xFFFFFFU
#endif
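// How the zip trick works (informal sketch): with the compare mask viewed as
// bytes c0..c15, vzip_u8 followed by vzip_u16 places {c0, c8, c4, c12} in the
// first 32-bit lane, i.e. one representative byte from float lanes 0, 2, 1, 3.
// Each compared lane is all-ones or all-zeros, so masking with 0xFFFFFF tests
// exactly lanes 0, 1, and 2 without a full horizontal reduction.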
// True when the x, y, and z lanes of vec1 and vec2 are all equal.
inline bool __vectorcall Vector3::CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpEq(vec1, vec2));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  NLIB_NEON_CMPVector3(vceqq_f32, vec1, vec2);
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] == vec2.vec.ps[0][0] &&
         vec1.vec.ps[0][1] == vec2.vec.ps[0][1] &&
         vec1.vec.ps[1][0] == vec2.vec.ps[1][0];
#endif
}
// True when the x, y, and z lanes of vec1 are all less than those of vec2.
inline bool __vectorcall Vector3::CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpLt(vec1, vec2));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  NLIB_NEON_CMPVector3(vcltq_f32, vec1, vec2);
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] < vec2.vec.ps[0][0] &&
         vec1.vec.ps[0][1] < vec2.vec.ps[0][1] &&
         vec1.vec.ps[1][0] < vec2.vec.ps[1][0];
#endif
}
// True when the x, y, and z lanes of vec1 are all less than or equal.
inline bool __vectorcall Vector3::CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpLe(vec1, vec2));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  NLIB_NEON_CMPVector3(vcleq_f32, vec1, vec2);
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] <= vec2.vec.ps[0][0] &&
         vec1.vec.ps[0][1] <= vec2.vec.ps[0][1] &&
         vec1.vec.ps[1][0] <= vec2.vec.ps[1][0];
#endif
}
// True when the x, y, and z lanes of vec1 are all greater than those of vec2.
inline bool __vectorcall Vector3::CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpGt(vec1, vec2));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  NLIB_NEON_CMPVector3(vcgtq_f32, vec1, vec2);
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] > vec2.vec.ps[0][0] &&
         vec1.vec.ps[0][1] > vec2.vec.ps[0][1] &&
         vec1.vec.ps[1][0] > vec2.vec.ps[1][0];
#endif
}
// True when the x, y, and z lanes of vec1 are all greater than or equal.
inline bool __vectorcall Vector3::CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpGe(vec1, vec2));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  NLIB_NEON_CMPVector3(vcgeq_f32, vec1, vec2);
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] >= vec2.vec.ps[0][0] &&
         vec1.vec.ps[0][1] >= vec2.vec.ps[0][1] &&
         vec1.vec.ps[1][0] >= vec2.vec.ps[1][0];
#endif
}
// True when at least one of the x, y, or z lanes of vec1 and vec2 differs.
inline bool __vectorcall Vector3::CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41)
  int mask = F128::MoveMask(F128::CmpEq(vec1, vec2));
  return ((mask & 7) != 7);
#elif defined(NLIB_NEON)
  uint8x16_t cmp = vreinterpretq_u8_u32(vceqq_f32(vec1, vec2));
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),
                               vreinterpret_u16_u8(zip.val[1]));
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) != 0xFFFFFFU;
#elif defined(NLIB_CAFE_PPC)
  return vec1.vec.ps[0][0] != vec2.vec.ps[0][0] ||
         vec1.vec.ps[0][1] != vec2.vec.ps[0][1] ||
         vec1.vec.ps[1][0] != vec2.vec.ps[1][0];
#endif
}
// True when the x, y, and z lanes of vec1 and vec2 are equal within eps.
inline bool __vectorcall Vector3::CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2,
                                            f128arg eps) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41) || defined(NLIB_CAFE_PPC)
  int mask = F128::MoveMask(F128::CmpNearEq(vec1, vec2, eps));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  f128 cmp_ = F128::CmpNearEq(vec1, vec2, eps);
  uint8x16_t cmp = vreinterpretq_u8_f32(cmp_);
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),
                               vreinterpret_u16_u8(zip.val[1]));
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) == 0xFFFFFFU;
#endif
}
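// Usage sketch (illustrative): tolerant equality with a per-lane epsilon.
// The epsilon-broadcast helper shown here is hypothetical; any f128 whose
// lanes hold the tolerance works.
//
//   f128 eps = /* broadcast 1.0e-5f to all lanes */;
//   bool nearly_equal = Vector3::CmpNearEq(a, b, eps);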
#if defined(NLIB_NEON)
#undef NLIB_NEON_CMPVector3
#endif
// True when the x, y, and z lanes of vec are all NaN.
inline bool __vectorcall Vector3::IsNaN(SimdVectorArg vec) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41) || defined(NLIB_CAFE_PPC)
  int mask = F128::MoveMask(F128::IsNaN(vec));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  f128 cmp_ = F128::IsNaN(vec);
  uint8x16_t cmp = vreinterpretq_u8_f32(cmp_);
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),
                               vreinterpret_u16_u8(zip.val[1]));
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) == 0xFFFFFFU;
#endif
}
// True when the x, y, and z lanes of vec are all infinite.
inline bool __vectorcall Vector3::IsInfinite(SimdVectorArg vec) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41) || defined(NLIB_CAFE_PPC)
  int mask = F128::MoveMask(F128::IsInfinite(vec));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  f128 cmp_ = F128::IsInfinite(vec);
  uint8x16_t cmp = vreinterpretq_u8_f32(cmp_);
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),
                               vreinterpret_u16_u8(zip.val[1]));
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) == 0xFFFFFFU;
#endif
}
// True when the x, y, and z lanes of vec all lie within the corresponding
// lanes of bounds (signature reconstructed from the F128::InBound call).
inline bool __vectorcall Vector3::InBound(SimdVectorArg vec, SimdVectorArg bounds) NLIB_NOEXCEPT {
#if defined(NLIB_F128_SIMD_NOUSE) || defined(NLIB_SSE41) || defined(NLIB_CAFE_PPC)
  int mask = F128::MoveMask(F128::InBound(vec, bounds));
  return ((mask & 7) == 7);
#elif defined(NLIB_NEON)
  f128 cmp_ = F128::InBound(vec, bounds);
  uint8x16_t cmp = vreinterpretq_u8_f32(cmp_);
  uint8x8x2_t zip = vzip_u8(vget_low_u8(cmp), vget_high_u8(cmp));
  uint16x4x2_t zip2 = vzip_u16(vreinterpret_u16_u8(zip.val[0]),
                               vreinterpret_u16_u8(zip.val[1]));
  return (vget_lane_u32(vreinterpret_u32_u16(zip2.val[0]), 0) & 0xFFFFFFU) == 0xFFFFFFU;
#endif
}
// Returns dot(vec1.xyz, vec2.xyz) broadcast to all four lanes.
NLIB_M(f128) Vector3::Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#ifdef NLIB_F128_SIMD_NOUSE
  float tmp = vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
              vec1.vec.v[2] * vec2.vec.v[2];
  f128 rval;
  rval.vec.v[0] = rval.vec.v[1] = rval.vec.v[2] = rval.vec.v[3] = tmp;
  return rval;
#elif defined(NLIB_SSE41)
  return _mm_dp_ps(vec1, vec2, 0x7F);
#elif defined(NLIB_NEON)
  f128 tmp = vmulq_f32(vec1, vec2);
  float32x2_t lo = vget_low_f32(tmp);
  lo = vpadd_f32(lo, lo);  // x+y in both halves
  float32x2_t hi = vdup_lane_f32(vget_high_f32(tmp), 0);  // z duplicated
  lo = vadd_f32(lo, hi);
  return vcombine_f32(lo, lo);
#elif defined(NLIB_CAFE_PPC)
  f128 tmp = F128::Mult(vec1, vec2);
  f32x2 val = __PS_SUM0(tmp.vec.ps[0], tmp.vec.ps[0], tmp.vec.ps[0]);  // {x+y, y}
  val = __PS_ADD(val, tmp.vec.ps[1]);  // x+y+z in element 0
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
  return ret;
#endif
}
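// Note on the SSE4.1 immediate: in _mm_dp_ps, bits [7:4] of the immediate
// select which input lanes are multiplied and summed and bits [3:0] select
// which output lanes receive the sum (the rest are zeroed). 0x7F therefore
// sums the x, y, z products and broadcasts the result to all four lanes.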
template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
NLIB_MH(f128) Vector3::Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41)
  return _mm_dp_ps(vec1, vec2,
                   (0x70 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
                    (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
#else
  return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(),
                                                             Dot(vec1, vec2));
#endif
}
// Cross product of the xyz parts; the w lane of the result is 0.
NLIB_M(SimdVector) Vector3::Cross(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
  SimdVector rval = F128::Mult(F128::Swizzle<1, 2, 0, 1>(vec1),
                               F128::Swizzle<2, 0, 1, 1>(vec2));
  rval = F128::MultSub(F128::Swizzle<2, 0, 1, 1>(vec1),
                       F128::Swizzle<1, 2, 0, 1>(vec2), rval);
  return rval;
}
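// Worked check (illustrative, assuming F128::MultSub(a, b, c) computes
// c - a*b, consistent with its use in Reflect below): the swizzles give
//   lane0 = y1*z2 - z1*y2, lane1 = z1*x2 - x1*z2, lane2 = x1*y2 - y1*x2,
//   lane3 = y1*y2 - y1*y2 = 0,
// so e.g. cross((1,0,0), (0,1,0)) yields (0,0,1,0).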
// Normalizes the xyz part of vec. A zero vector maps to zero, and a vector
// of infinite length maps to NaN, via the two selects below.
NLIB_M(SimdVector) Vector3::Normalize(SimdVectorArg vec) NLIB_NOEXCEPT {
  f128 dot = Vector3::Dot(vec, vec);
  f128 rsqrt = F128::RecpSqrt(dot);
  f128 inf = F128::SetInfinity();
  f128 zero = F128::SetZero();
  f128 eqzero = F128::CmpEq(dot, zero);
  f128 eqinf = F128::CmpEq(dot, inf);
  f128 ret = F128::Mult(vec, rsqrt);  // scale by 1/length (reconstructed line)
  f128 nan = F128::SetNaN();
  ret = F128::Select(eqzero, zero, ret);
  ret = F128::Select(eqinf, nan, ret);
  return ret;
}
// Squared length of the xyz part, broadcast to all lanes.
NLIB_M(f128) Vector3::LengthSq(SimdVectorArg vec) NLIB_NOEXCEPT {
  return Dot(vec, vec);
}
// Length of the xyz part (signature reconstructed).
NLIB_M(f128) Vector3::Length(SimdVectorArg vec) NLIB_NOEXCEPT {
  return F128::Sqrt(Dot(vec, vec));
}
// Estimated length, using the lower-precision square root.
NLIB_M(f128) Vector3::LengthEst(SimdVectorArg vec) NLIB_NOEXCEPT {
  return F128::SqrtEst(Dot(vec, vec));
}
// Reciprocal of the length.
NLIB_M(f128) Vector3::RecpLength(SimdVectorArg vec) NLIB_NOEXCEPT {
  return F128::RecpSqrt(Dot(vec, vec));
}
// Estimated reciprocal of the length.
NLIB_M(f128) Vector3::RecpLengthEst(SimdVectorArg vec) NLIB_NOEXCEPT {
  return F128::RecpSqrtEst(Dot(vec, vec));
}
// Estimated normalization; trades accuracy for speed.
NLIB_M(SimdVector) Vector3::NormalizeEst(SimdVectorArg vec) NLIB_NOEXCEPT {
  return F128::Mult(vec, RecpLengthEst(vec));
}
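// Usage sketch (illustrative):
//
//   SimdVector v = Vector3::LoadFloat3(&in);  // in: Float3 {3, 0, 4}
//   f128 len = Vector3::Length(v);            // 5 in every lane
//   SimdVector n = Vector3::Normalize(v);     // (0.6, 0, 0.8)
//   // The *Est variants use the estimated reciprocal square root and are
//   // faster but less accurate.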
// Angle in radians between two normalized vectors (reconstructed signature;
// the original name is elided). The dot product is clamped to [-1, 1] so
// rounding error cannot push it outside the domain of ArcCos.
NLIB_M(f128) Vector3::AngleBetweenNormalizedVectors(
    SimdVectorArg vec1_normalized, SimdVectorArg vec2_normalized) NLIB_NOEXCEPT {
  f128 ret = Dot(vec1_normalized, vec2_normalized);
  ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
  return F128::ArcCos(ret);
}
// Reflection of vec about the plane with unit normal `normal`, i.e.
// vec - 2 * dot(vec, normal) * normal (signature and the doubling step are
// reconstructed; one line was elided in the source).
NLIB_M(SimdVector) Vector3::Reflect(SimdVectorArg vec, SimdVectorArg normal) NLIB_NOEXCEPT {
  f128 s = Dot(vec, normal);
  s = F128::Add(s, s);  // 2 * dot (reconstructed line)
  return F128::MultSub(s, normal, vec);
}
// Transforms vec as a position (w taken as 1) by the 4x4 matrix m. Everything
// before the final add is reconstructed: each of x, y, z is broadcast with
// SetValue<N> and multiplied into the corresponding row, then the translation
// row m.r[3] is added.
NLIB_M(SimdVector) Vector3::Transform(SimdVectorArg vec, const SimdMatrix& m) NLIB_NOEXCEPT {
  f128 ret = F128::Mult(F128::SetValue<0>(vec, each_select32), m.r[0]);
  ret = F128::MultAdd(F128::SetValue<1>(vec, each_select32), m.r[1], ret);
  ret = F128::MultAdd(F128::SetValue<2>(vec, each_select32), m.r[2], ret);
  return F128::Add(m.r[3], ret);
}
// Transforms vec by m and divides through by the resulting w lane
// (reconstructed signature; the name follows the common TransformCoord
// convention).
NLIB_M(SimdVector) Vector3::TransformCoord(SimdVectorArg vec, const SimdMatrix& m) NLIB_NOEXCEPT {
  f128 tmp = Vector3::Transform(vec, m);
  return F128::Div(tmp, F128::SetValue<3>(tmp, each_select32));
}
// Rotation of vec by the unit quaternion q_normalized. The setup lines are
// reconstructed: v is vec with its w lane cleared to 0, conj is the conjugate
// of q_normalized, and the result is conj * v * q in Quaternion::Mult order.
NLIB_M(SimdVector) Vector3::Rotate(SimdVectorArg vec,
                                   SimdQuaternionArg q_normalized) NLIB_NOEXCEPT {
  SimdQuaternion v = F128::Splat<false, false, false, true>(vec, F128::SetZero());
  SimdQuaternion conj = Quaternion::Conjugate(q_normalized);
  return Quaternion::Mult(Quaternion::Mult(conj, v), q_normalized);
}

// Inverse rotation: the same construction with the multiplication order
// swapped, i.e. q * v * conj (reconstructed as above).
NLIB_M(SimdVector) Vector3::InvRotate(SimdVectorArg vec,
                                      SimdQuaternionArg q_normalized) NLIB_NOEXCEPT {
  SimdQuaternion v = F128::Splat<false, false, false, true>(vec, F128::SetZero());
  SimdQuaternion conj = Quaternion::Conjugate(q_normalized);
  return Quaternion::Mult(Quaternion::Mult(q_normalized, v), conj);
}
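// Usage sketch (illustrative): InvRotate undoes Rotate for a unit quaternion,
// so the round trip reproduces v up to rounding:
//
//   SimdVector r = Vector3::Rotate(v, q);
//   SimdVector v2 = Vector3::InvRotate(r, q);  // v2 ~= v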
#endif  // NLIB_DOXYGEN

#endif  // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR3_H_
// ---------------------------------------------------------------------------
// Related definitions (summarized from the accompanying reference docs):
//
//   NLIB_NOEXCEPT            Defines noexcept geared to the environment, or
//                            the equivalent.
//   NLIB_STATIC_ASSERT(exp)  Defines a static assertion. Uses static_assert
//                            if it is available for use.
//   f128                     nlib_f128_t is defined using typedef.
//   SimdVectorArg            f128arg is defined using typedef.
//   SimdQuaternionArg        f128arg is defined using typedef.
//   SimdVector               f128 is defined using typedef. Used when handling
//                            three-dimensional or four-dimensional vectors.
//   SimdQuaternion           f128 is defined using typedef. Used when handling
//                            quaternions.
//   Float3                   The type for reading and writing three-dimensional
//                            vectors in memory. Keeps float-type x, y, and z.
//   each_select32            constexpr each_select32_tag constant. The tag for
//                            representing the selection of a 32-bit lane with
//                            an each_select32_tag-type constant object.
//   Vector3                  The class with the collection of functions that
//                            perform calculations on three-dimensional vectors.
//   SimdMatrix (assumed name) The structure for keeping a 4x4 matrix.
//   F128 (assumed name)      The class and functions for SIMD computations on
//                            single-precision floating-point numbers.
//   (four-dimensional vector type) Defines a four-dimensional vector.
// ---------------------------------------------------------------------------