3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
37 static SimdMatrix __vectorcall FromScaling(
float scale_x,
float scale_y,
40 static SimdMatrix __vectorcall FromTranslation(
float ofs_x,
float ofs_y,
64 PerspectiveLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
66 PerspectiveRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
67 static SimdMatrix __vectorcall PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
68 float aspect,
float near_z,
70 static SimdMatrix __vectorcall PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
71 float aspect,
float near_z,
73 static SimdMatrix __vectorcall PerspectiveOffCenterLh(
float left,
float right,
float bottom,
74 float top,
float near_z,
76 static SimdMatrix __vectorcall PerspectiveOffCenterRh(
float left,
float right,
float bottom,
77 float top,
float near_z,
81 OrthographicLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
83 OrthographicRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
84 static SimdMatrix __vectorcall OrthographicOffCenterLh(
float left,
float right,
float bottom,
85 float top,
float near_z,
87 static SimdMatrix __vectorcall OrthographicOffCenterRh(
float left,
float right,
float bottom,
88 float top,
float near_z,
104 #define NLIB_M(tp) inline tp __vectorcall
108 m.
r[0] = F128::LoadA16(&p->m[0][0]);
109 m.
r[1] = F128::LoadA16(&p->m[1][0]);
110 m.
r[2] = F128::LoadA16(&p->m[2][0]);
111 m.
r[3] = F128::LoadA16(&p->m[3][0]);
// Loads a Float3x4 from memory into SIMD rows.
// Visible behavior: rows 0-2 are read with F128::LoadA16 from p->m[0..2][0];
// row 3 is read from the constant F128::v0001_.
// NOTE(review): this listing skips original lines (117, and everything after
// 121), so the declaration of `m`, any later transpose step and the return
// statement are not visible here -- confirm against the full header.
116 NLIB_M(SimdMatrix) Matrix::LoadFloat3x4(const Float3x4* p)
NLIB_NOEXCEPT {
118 m.r[0] = F128::LoadA16(&p->m[0][0]);
119 m.r[1] = F128::LoadA16(&p->m[1][0]);
120 m.r[2] = F128::LoadA16(&p->m[2][0]);
121 m.r[3] = F128::LoadA16(F128::v0001_);
// Loads a Float4x3 from memory and expands it into four SIMD rows.
// Three vector loads are issued at &m[0][0], &m[1][1] and &m[2][2]; the rows
// are then reassembled with F128::Permute.
// NOTE(review): assumes Float4x3::m is float[4][3], so the three loads cover
// flat elements 0-3, 4-7 and 8-11 -- confirm against the type definition.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
126 NLIB_M(SimdMatrix) Matrix::LoadFloat4x3(const Float4x3* p)
NLIB_NOEXCEPT {
127 f128 t0 = F128::LoadA16(&p->m[0][0]);
128 f128 t1 = F128::LoadA16(&p->m[1][1]);
129 f128 t2 = F128::LoadA16(&p->m[2][2]);
// Rows 0-2 get lane 3 cleared to zero.
131 m.r[0] = F128::SetZeroToLane<3>(t0);
132 f128 tmp1 = F128::Permute<3, 4, 5, 8>(t0, t1);
133 m.r[1] = F128::SetZeroToLane<3>(tmp1);
134 f128 tmp2 = F128::Permute<2, 3, 4, 8>(t1, t2);
135 m.r[2] = F128::SetZeroToLane<3>(tmp2);
// Row 3 combines lanes of t2 with a lane of the all-ones vector
// (presumably yielding w = 1 -- depends on Permute index semantics).
136 m.r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
// Loads a Float3x3 from memory and expands it into four SIMD rows.
// Uses F128::LoadA4 at &m[0][0], &m[1][0] and &m[1][2]; rows 0 and 1 get
// lane 3 cleared, row 2 is built from t2 and a zero vector via Permute, and
// row 3 is read from the constant F128::v0001_.
// NOTE(review): the third load starts at m[1][2], presumably so that a
// 4-float read stays inside the 9-float matrix -- confirm with Float3x3 layout.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
140 NLIB_M(SimdMatrix) Matrix::LoadFloat3x3(const Float3x3* p)
NLIB_NOEXCEPT {
141 f128 t0 = F128::LoadA4(&p->m[0][0]);
142 f128 t1 = F128::LoadA4(&p->m[1][0]);
143 f128 t2 = F128::LoadA4(&p->m[1][2]);
144 f128 zero = F128::SetZero();
146 m.r[0] = F128::SetZeroToLane<3>(t0);
147 m.r[1] = F128::SetZeroToLane<3>(t1);
148 m.r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
149 m.r[3] = F128::LoadA16(F128::v0001_);
// Writes the four rows of m to p->m[0..3][0] with F128::StoreA16, one row
// per store and without any reordering.
// NOTE(review): StoreA16 presumably requires a 16-byte aligned destination --
// confirm against the F128 documentation.
153 inline void __vectorcall Matrix::StoreFloat4x4(Float4x4* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
154 F128::StoreA16(&p->m[0][0], m.r[0]);
155 F128::StoreA16(&p->m[1][0], m.r[1]);
156 F128::StoreA16(&p->m[2][0], m.r[2]);
157 F128::StoreA16(&p->m[3][0], m.r[3]);
// Packs the four rows of m into three vectors with F128::Permute and stores
// them at &p->m[0][0], &p->m[1][1] and &p->m[2][2].
// NOTE(review): assumes Float4x3::m is float[4][3], so the three stores fill
// flat elements 0-3, 4-7 and 8-11 and lane 3 of each source row is discarded
// -- confirm against the type definition and Permute index semantics.
160 inline void __vectorcall Matrix::StoreFloat4x3(Float4x3* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
161 f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
162 f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
163 f128 t2 = F128::Permute<2, 4, 5, 6>(m.r[2], m.r[3]);
164 F128::StoreA16(&p->m[0][0], t0);
165 F128::StoreA16(&p->m[1][1], t1);
166 F128::StoreA16(&p->m[2][2], t2);
// Stores the upper-left part of m into a Float3x3.
// Two packed vectors (built with Permute from rows 0/1 and rows 1/2) are
// written with F128::StoreA4 at &m[0][0] and &m[1][1]; the last element
// m[2][2] is written as a scalar taken from lane 2 of row 2.
// NOTE(review): assumes Float3x3::m is float[3][3] so the two vector stores
// cover flat elements 0-7 -- confirm against the type definition.
169 inline void __vectorcall Matrix::StoreFloat3x3(Float3x3* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
170 f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
171 f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
172 F128::StoreA4(&p->m[0][0], t0);
173 F128::StoreA4(&p->m[1][1], t1);
174 p->m[2][2] = F128::GetFloatFromLane<2>(m.r[2]);
192 f128 c0det, c1det, c2det;
194 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.r[2]);
195 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.r[3]);
197 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.r[2]);
198 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.r[3]);
200 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.r[2]);
201 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.r[3]);
203 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
204 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
205 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
207 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
208 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
209 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
212 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.r[1]);
213 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.r[1]);
214 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.r[1]);
215 f128 r0x = F128::NegateEx<true, false, true, false>(m.r[0]);
217 f128 det3_neg = F128::Mult(c1det, ccbb_1);
218 det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
219 det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
220 return Vector4::Dot(r0x, det3_neg);
230 m.r[0] = F128::LoadA16(F128::v1000_);
231 m.r[1] = F128::LoadA16(F128::v0100_);
232 m.r[2] = F128::LoadA16(F128::v0010_);
233 m.r[3] = F128::LoadA16(F128::v0001_);
// Produces the transpose of m in `ret`, using one of three code paths chosen
// at preprocessing time: NEON intrinsics, the NLIB_CAFE_PPC paired-single
// merges, or a generic F128::Permute sequence.
// NOTE(review): the declaration of `ret`, the `tmp0`/`tmp1` declarations for
// the PPC path, the #else/#endif lines and the return are on lines dropped
// from this listing.
239 NLIB_M(SimdMatrix) Matrix::Transpose(SimdMatrixArg m)
NLIB_NOEXCEPT {
241 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
// NEON: interleave row pairs with vtrnq_f32, then stitch low/high halves.
242 float32x4x2_t trn_f0 = vtrnq_f32(m.r[0], m.r[1]);
243 float32x4x2_t trn_f1 = vtrnq_f32(m.r[2], m.r[3]);
244 ret.r[0] = vcombine_f32(vget_low_f32(trn_f0.val[0]), vget_low_f32(trn_f1.val[0]));
245 ret.r[1] = vcombine_f32(vget_low_f32(trn_f0.val[1]), vget_low_f32(trn_f1.val[1]));
246 ret.r[2] = vcombine_f32(vget_high_f32(trn_f0.val[0]), vget_high_f32(trn_f1.val[0]));
247 ret.r[3] = vcombine_f32(vget_high_f32(trn_f0.val[1]), vget_high_f32(trn_f1.val[1]));
249 #elif !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_CAFE_PPC)
// PPC: each row is two paired-singles (ps[0], ps[1]); every 2x2 sub-block is
// transposed with __PS_MERGE00/__PS_MERGE11 and written to its mirrored slot.
252 tmp0 = __PS_MERGE00(m.r[0].vec.ps[0], m.r[1].vec.ps[0]);
253 tmp1 = __PS_MERGE11(m.r[0].vec.ps[0], m.r[1].vec.ps[0]);
254 ret.r[0].vec.ps[0] = tmp0;
255 ret.r[1].vec.ps[0] = tmp1;
257 tmp0 = __PS_MERGE00(m.r[2].vec.ps[1], m.r[3].vec.ps[1]);
258 tmp1 = __PS_MERGE11(m.r[2].vec.ps[1], m.r[3].vec.ps[1]);
259 ret.r[2].vec.ps[1] = tmp0;
260 ret.r[3].vec.ps[1] = tmp1;
262 tmp0 = __PS_MERGE00(m.r[0].vec.ps[1], m.r[1].vec.ps[1]);
263 tmp1 = __PS_MERGE11(m.r[0].vec.ps[1], m.r[1].vec.ps[1]);
264 ret.r[2].vec.ps[0] = tmp0;
265 ret.r[3].vec.ps[0] = tmp1;
267 tmp0 = __PS_MERGE00(m.r[2].vec.ps[0], m.r[3].vec.ps[0]);
268 tmp1 = __PS_MERGE11(m.r[2].vec.ps[0], m.r[3].vec.ps[0]);
269 ret.r[0].vec.ps[1] = tmp0;
270 ret.r[1].vec.ps[1] = tmp1;
// Generic path (presumably under an #else that this listing dropped):
// gather row halves first, then pick even/odd lanes.
273 f128 tmp0 = F128::Permute<0, 1, 4, 5>(m.r[0], m.r[1]);
274 f128 tmp2 = F128::Permute<2, 3, 6, 7>(m.r[0], m.r[1]);
275 f128 tmp1 = F128::Permute<0, 1, 4, 5>(m.r[2], m.r[3]);
276 f128 tmp3 = F128::Permute<2, 3, 6, 7>(m.r[2], m.r[3]);
277 ret.r[0] = F128::Permute<0, 2, 4, 6>(tmp0, tmp1);
278 ret.r[1] = F128::Permute<1, 3, 5, 7>(tmp0, tmp1);
279 ret.r[2] = F128::Permute<0, 2, 4, 6>(tmp2, tmp3);
280 ret.r[3] = F128::Permute<1, 3, 5, 7>(tmp2, tmp3);
// Transposes m and writes the first three rows of the transposed matrix to
// p->m[0..2][0] with F128::StoreA16. The fourth transposed row is not stored.
285 inline void __vectorcall Matrix::StoreFloat3x4(Float3x4* p, SimdMatrixArg m) NLIB_NOEXCEPT {
286 SimdMatrix M = Matrix::Transpose(m);
287 F128::StoreA16(&p->m[0][0], M.r[0]);
288 F128::StoreA16(&p->m[1][0], M.r[1]);
289 F128::StoreA16(&p->m[2][0], M.r[2]);
// Computes the inverse of m into `ret` and writes the determinant to *det.
// The work is done on M = Transpose(m): 2x2 sub-determinants (c0det, c1det,
// c2det) are built from a pair of rows, combined with a third row into `det3`,
// and each result row is det3 (with alternating signs) times 1/determinant.
// NOTE(review): this listing drops original lines (e.g. 295-317, 351, 370-384),
// which hide the declaration of `ret`, the brace-delimited scopes that allow
// `det3`, `c0det`, ... to be declared more than once, any guard around the
// write to *det, and the return statement -- confirm against the full header.
// NOTE(review): no handling of a zero determinant is visible in these lines.
293 NLIB_M(SimdMatrix) Matrix::Inverse(
SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT {
294 SimdMatrix M = Transpose(m);
// Despite its name, this variable first holds the determinant itself and is
// only turned into the reciprocal by the F128::Recp call further down.
318 f128 detValueReciprocal;
321 f128 c0det, c1det, c2det;
// First half: sub-determinants taken from rows 2 and 3 of M.
323 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
324 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
326 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
327 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
329 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
330 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
332 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
333 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
334 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
336 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
337 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
338 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
341 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
342 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
343 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
// Row 0 with lanes 0 and 2 negated, used as the signed weights of the
// final dot product that yields the determinant.
344 f128 r0x = F128::NegateEx<true, false, true, false>(M.r[0]);
346 f128 det3 = F128::Mult(c1det, ccbb_1);
347 det3 = F128::MultSub(c0det, baaa_1, det3);
348 det3 = F128::MultSub(c2det, dddc_1, det3);
350 detValueReciprocal = Vector4::Dot(r0x, det3);
// Determinant is handed back to the caller before it is inverted.
352 *det = detValueReciprocal;
355 det3 = F128::NegateEx<true, false, true, false>(det3);
356 detValueReciprocal = F128::Recp(detValueReciprocal);
358 ret.r[0] = F128::Mult(detValueReciprocal, det3);
// Result row 1: same sub-determinants combined with row 0 of M.
361 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
362 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
363 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
365 f128 det3 = F128::Mult(c0det, baaa_0);
366 det3 = F128::MultAdd(c2det, dddc_0, det3);
367 det3 = F128::MultSub(c1det, ccbb_0, det3);
368 det3 = F128::NegateEx<true, false, true, false>(det3);
369 ret.r[1] = F128::Mult(detValueReciprocal, det3);
// Second half: sub-determinants taken from rows 0 and 1 of M, used for
// result rows 2 and 3.
385 f128 c0det, c1det, c2det;
387 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
388 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
390 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
391 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
393 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
394 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
396 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
397 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
398 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
400 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
401 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
402 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
405 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
406 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
407 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
409 f128 det3 = F128::Mult(c1det, ccbb_3);
410 det3 = F128::MultSub(c0det, baaa_3, det3);
411 det3 = F128::MultSub(c2det, dddc_3, det3);
412 det3 = F128::NegateEx<true, false, true, false>(det3);
414 ret.r[2] = F128::Mult(detValueReciprocal, det3);
417 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
418 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
419 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
421 f128 det3 = F128::Mult(c0det, baaa_2);
422 det3 = F128::MultAdd(c2det, dddc_2, det3);
423 det3 = F128::MultSub(c1det, ccbb_2, det3);
424 det3 = F128::NegateEx<true, false, true, false>(det3);
426 ret.r[3] = F128::Mult(detValueReciprocal, det3);
// Returns whether m equals the identity matrix.
// Each row is compared with F128::CmpEq against the matching unit-row constant
// (v1000_, v0100_, v0010_, v0001_); the four masks are ANDed together and the
// result is F128::IsAllMaskTrue of the combined mask. The comparison is exact,
// with no tolerance.
434 NLIB_M(
bool) Matrix::IsIdentity(SimdMatrixArg m) NLIB_NOEXCEPT {
435 f128 cmp0 = F128::CmpEq(m.r[0], F128::LoadA16(F128::v1000_));
436 f128 cmp1 = F128::CmpEq(m.r[1], F128::LoadA16(F128::v0100_));
437 f128 cmp2 = F128::CmpEq(m.r[2], F128::LoadA16(F128::v0010_));
438 f128 cmp3 = F128::CmpEq(m.r[3], F128::LoadA16(F128::v0001_));
439 cmp0 = F128::And(cmp0, cmp1);
440 cmp2 = F128::And(cmp2, cmp3);
441 cmp0 = F128::And(cmp0, cmp2);
442 return F128::IsAllMaskTrue(cmp0);
// Returns true when at least one element of m is +/-infinity.
// Two variants are selected by NLIB_F128_SIMD_NOUSE: one calls
// F128::IsInfinite per row, the other compares F128::Abs(row) against
// F128::SetInfinity(). In both, the four row masks are ORed together and the
// function returns true unless the combined mask is all-false.
// NOTE(review): the #else/#endif separating the two variants are on lines
// dropped from this listing.
447 NLIB_M(
bool) Matrix::IsInfinite(SimdMatrixArg m) NLIB_NOEXCEPT {
448 #ifdef NLIB_F128_SIMD_NOUSE
449 f128 cmp0 = F128::IsInfinite(m.r[0]);
450 f128 cmp1 = F128::IsInfinite(m.r[1]);
451 f128 cmp2 = F128::IsInfinite(m.r[2]);
452 f128 cmp3 = F128::IsInfinite(m.r[3]);
453 cmp0 = F128::Or(cmp0, cmp1);
454 cmp2 = F128::Or(cmp2, cmp3);
455 cmp0 = F128::Or(cmp0, cmp2);
456 return !F128::IsAllMaskFalse(cmp0);
// SIMD variant: |x| == inf catches both signs with a single comparison.
458 f128 inf_value = F128::SetInfinity();
459 f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.r[0]));
460 f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.r[1]));
461 f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.r[2]));
462 f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.r[3]));
463 cmp0 = F128::Or(cmp0, cmp1);
464 cmp2 = F128::Or(cmp2, cmp3);
465 cmp0 = F128::Or(cmp0, cmp2);
466 return !F128::IsAllMaskFalse(cmp0);
// Returns true when at least one element of m is NaN.
// F128::IsNaN is evaluated per row, the four masks are ORed together, and the
// function returns true unless the combined mask is all-false.
472 NLIB_M(
bool) Matrix::IsNaN(SimdMatrixArg m) NLIB_NOEXCEPT {
473 f128 cmp0 = F128::IsNaN(m.r[0]);
474 f128 cmp1 = F128::IsNaN(m.r[1]);
475 f128 cmp2 = F128::IsNaN(m.r[2]);
476 f128 cmp3 = F128::IsNaN(m.r[3]);
477 cmp0 = F128::Or(cmp0, cmp1);
478 cmp2 = F128::Or(cmp2, cmp3);
479 cmp0 = F128::Or(cmp0, cmp2);
480 return !F128::IsAllMaskFalse(cmp0);
// Matrix product of a and b, computed row by row: result row i is
// Vector4::Transform(a.r[i], b).
// NOTE(review): presumably Transform computes row-vector * matrix, making the
// result a * b -- confirm against Vector4::Transform.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
485 NLIB_M(SimdMatrix) Matrix::Mult(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
487 m.r[0] = Vector4::Transform(a.r[0], b);
488 m.r[1] = Vector4::Transform(a.r[1], b);
489 m.r[2] = Vector4::Transform(a.r[2], b);
490 m.r[3] = Vector4::Transform(a.r[3], b);
// Computes the same four product rows as Matrix::Mult (Vector4::Transform of
// each row of a by b) and then writes their transpose into `ret`.
// The transpose uses NEON intrinsics when NLIB_NEON is available, otherwise a
// generic F128::Permute sequence identical in shape to Matrix::Transpose.
// NOTE(review): the declaration of `ret`, the #else/#endif lines and the
// return are on lines dropped from this listing.
496 NLIB_M(SimdMatrix) Matrix::MultTranspose(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
497 f128 r0 = Vector4::Transform(a.r[0], b);
498 f128 r1 = Vector4::Transform(a.r[1], b);
499 f128 r2 = Vector4::Transform(a.r[2], b);
500 f128 r3 = Vector4::Transform(a.r[3], b);
502 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
503 float32x4x2_t trn_f0 = vtrnq_f32(r0, r1);
504 float32x4x2_t trn_f1 = vtrnq_f32(r2, r3);
505 ret.r[0] = vcombine_f32(vget_low_f32(trn_f0.val[0]), vget_low_f32(trn_f1.val[0]));
506 ret.r[1] = vcombine_f32(vget_low_f32(trn_f0.val[1]), vget_low_f32(trn_f1.val[1]));
507 ret.r[2] = vcombine_f32(vget_high_f32(trn_f0.val[0]), vget_high_f32(trn_f1.val[0]));
508 ret.r[3] = vcombine_f32(vget_high_f32(trn_f0.val[1]), vget_high_f32(trn_f1.val[1]));
// Generic path (presumably under an #else that this listing dropped).
510 f128 tmp0 = F128::Permute<0, 1, 4, 5>(r0, r1);
511 f128 tmp2 = F128::Permute<2, 3, 6, 7>(r0, r1);
512 f128 tmp1 = F128::Permute<0, 1, 4, 5>(r2, r3);
513 f128 tmp3 = F128::Permute<2, 3, 6, 7>(r2, r3);
514 ret.r[0] = F128::Permute<0, 2, 4, 6>(tmp0, tmp1);
515 ret.r[1] = F128::Permute<1, 3, 5, 7>(tmp0, tmp1);
516 ret.r[2] = F128::Permute<0, 2, 4, 6>(tmp2, tmp3);
517 ret.r[3] = F128::Permute<1, 3, 5, 7>(tmp2, tmp3);
// Builds a scaling matrix from three scalar factors.
// Starting from a zero vector, scale_x goes to lane 0 of row 0, scale_y to
// lane 1 of row 1, scale_z to lane 2 of row 2; row 3 is the constant
// F128::v0001_.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
527 NLIB_M(SimdMatrix) Matrix::FromScaling(
float scale_x,
float scale_y,
float scale_z) NLIB_NOEXCEPT {
529 f128 zero = F128::SetZero();
530 m.r[0] = F128::SetFloatToLane<0>(zero, scale_x);
531 m.r[1] = F128::SetFloatToLane<1>(zero, scale_y);
532 m.r[2] = F128::SetFloatToLane<2>(zero, scale_z);
533 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a scaling matrix from a vector of factors.
// Each of rows 0-2 is F128::Splat of `scale` and a zero vector with a mask
// that is false only on the diagonal lane; row 3 is the constant F128::v0001_.
// NOTE(review): presumably Splat keeps lanes of `scale` where the mask is
// false and takes `zero` where it is true, so lane 3 of `scale` is ignored --
// confirm against F128::Splat.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
542 NLIB_M(SimdMatrix) Matrix::FromScaling(
SimdVectorArg scale) NLIB_NOEXCEPT {
544 f128 zero = F128::SetZero();
545 m.r[0] = F128::Splat<false, true, true, true>(scale, zero);
546 m.r[1] = F128::Splat<true, false, true, true>(scale, zero);
547 m.r[2] = F128::Splat<true, true, false, true>(scale, zero);
548 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a translation matrix from three scalar offsets.
// Rows 0-2 are the unit-row constants v1000_, v0100_, v0010_; row 3 is
// (ofs_x, ofs_y, ofs_z, 1), i.e. the translation lives in the last row.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
557 NLIB_M(SimdMatrix) Matrix::FromTranslation(
float ofs_x,
float ofs_y,
float ofs_z) NLIB_NOEXCEPT {
559 m.r[0] = F128::LoadA16(F128::v1000_);
560 m.r[1] = F128::LoadA16(F128::v0100_);
561 m.r[2] = F128::LoadA16(F128::v0010_);
562 m.r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
// Builds a translation matrix from an offset vector.
// Rows 0-2 are the unit-row constants; row 3 is `ofs` with lane 3 overwritten
// by 1.f, so whatever the caller had in lane 3 of `ofs` is discarded.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
571 NLIB_M(SimdMatrix) Matrix::FromTranslation(
SimdVectorArg ofs) NLIB_NOEXCEPT {
573 m.r[0] = F128::LoadA16(F128::v1000_);
574 m.r[1] = F128::LoadA16(F128::v0100_);
575 m.r[2] = F128::LoadA16(F128::v0010_);
576 m.r[3] = F128::SetFloatToLane<3>(ofs, 1.f);
// Builds a rotation about the X axis from a precomputed sine and cosine.
// Visible construction: r1 has cos in lane 1 and sin in lane 2; r2 has -sin
// in lane 1 and cos in lane 2; rows 0 and 3 are the constants v1000_ and
// v0001_.
// NOTE(review): `zero` (presumably F128::SetZero()), the declaration of `m`,
// the assignments of r1/r2 to m.r[1]/m.r[2] and the return are on lines
// dropped from this listing -- confirm against the full header.
585 NLIB_M(SimdMatrix) Matrix::FromRotationX(
float sin_value,
float cos_value) NLIB_NOEXCEPT {
588 f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
589 r1 = F128::SetFloatToLane<2>(r1, sin_value);
590 f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
591 r2 = F128::SetFloatToLane<2>(r2, cos_value);
593 m.r[0] = F128::LoadA16(F128::v1000_);
596 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a rotation about the Y axis from a precomputed sine and cosine.
// Visible construction: r0 has cos in lane 0 and -sin in lane 2; r2 has sin
// in lane 0 and cos in lane 2; rows 1 and 3 are the constants v0100_ and
// v0001_.
// NOTE(review): `zero` (presumably F128::SetZero()), the declaration of `m`,
// the assignments of r0/r2 to m.r[0]/m.r[2] and the return are on lines
// dropped from this listing -- confirm against the full header.
605 NLIB_M(SimdMatrix) Matrix::FromRotationY(
float sin_value,
float cos_value) NLIB_NOEXCEPT {
608 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
609 r0 = F128::SetFloatToLane<2>(r0, -sin_value);
610 f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
611 r2 = F128::SetFloatToLane<2>(r2, cos_value);
614 m.r[1] = F128::LoadA16(F128::v0100_);
616 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a rotation about the Z axis from a precomputed sine and cosine.
// Visible construction: r0 has cos in lane 0 and sin in lane 1; r1 has -sin
// in lane 0 and cos in lane 1; rows 2 and 3 are the constants v0010_ and
// v0001_.
// NOTE(review): `zero` (presumably F128::SetZero()), the declaration of `m`,
// the assignments of r0/r1 to m.r[0]/m.r[1] and the return are on lines
// dropped from this listing -- confirm against the full header.
625 NLIB_M(SimdMatrix) Matrix::FromRotationZ(
float sin_value,
float cos_value) NLIB_NOEXCEPT {
628 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
629 r0 = F128::SetFloatToLane<1>(r0, sin_value);
630 f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
631 r1 = F128::SetFloatToLane<1>(r1, cos_value);
635 m.r[2] = F128::LoadA16(F128::v0010_);
636 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a rotation matrix about an axis from a precomputed sine and cosine.
// The parameter name says the axis must already be normalized; no
// normalization is performed in the visible lines.
// Visible steps: the diagonal is derived from the squared axis components and
// (1 - cos); the off-diagonal terms come from products of axis components
// scaled by (1 - cos), plus/minus `s` times a swizzled axis; Permute then
// distributes the plus/minus/diagonal values into rows 0-2. Row 3 is v0001_.
// NOTE(review): `c`, `s`, `c1`, `diagonal` and `m` are declared on lines
// dropped from this listing; `c` and `s` presumably hold cos_value and
// sin_value broadcast to all lanes -- confirm against the full header.
// NOTE(review): MultAdd(a, b, c) / MultSub(a, b, c) are presumably
// c + a*b and c - a*b -- confirm against the F128 documentation.
643 NLIB_M(SimdMatrix) Matrix::FromRotationAxisAndSinCos(
SimdVectorArg axis_normalized,
float sin_value,
644 float cos_value) NLIB_NOEXCEPT {
648 f128 nn = F128::Mult(axis_normalized, axis_normalized);
650 c1 = F128::SetValue(1.f - cos_value,
each_float);
651 diagonal = F128::MultAdd(c1, nn, c);
// Lane 3 of the diagonal is cleared before it is merged into the rows.
652 diagonal = F128::SetZeroToLane<3>(diagonal);
655 f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
657 f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
658 xy_yz_xz = F128::Mult(c1, xy_yz_xz);
659 f128 plus = F128::MultAdd(s, zxy, xy_yz_xz);
660 f128 minus = F128::MultSub(s, zxy, xy_yz_xz);
662 f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
663 f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
666 m.r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
667 m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
668 m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
669 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a rotation matrix from a quaternion.
// Visible steps: q2 = quat + quat and qq2 = quat * q2 (i.e. twice the squared
// components); the diagonal is 1 minus two swizzles of qq2 with lane 3 forced
// to 0; the off-diagonal terms are sums/differences of pairwise component
// products; Permute distributes them into rows 0-2. Row 3 is v0001_.
// NOTE(review): no normalization of `quat` is visible, so a unit quaternion
// is presumably expected -- confirm with the caller contract.
// NOTE(review): `t0`, `t1` and `m` are declared on lines dropped from this
// listing, and line 692 multiplies q2 by the `t0` assigned on line 687 unless
// a dropped line (690/691) reassigns it -- confirm against the full header.
675 NLIB_M(SimdMatrix) Matrix::FromRotationQuaternion(
SimdQuaternionArg quat) NLIB_NOEXCEPT {
677 f128 q2 = F128::Add(quat, quat);
678 f128 qq2 = F128::Mult(quat, q2);
681 t0 = F128::Swizzle<1, 0, 0, 1>(qq2);
682 t1 = F128::Swizzle<2, 2, 1, 0>(qq2);
684 f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
685 diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
687 t0 = F128::Swizzle<1, 0, 0, 1>(quat);
688 t1 = F128::Swizzle<2, 2, 1, 0>(q2);
689 f128 yz_xz_xy = F128::Mult(t0, t1);
692 f128 wx_wy_wz = F128::Mult(q2, t0);
694 f128 plus = F128::Add(yz_xz_xy, wx_wy_wz);
695 f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz);
697 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
698 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
701 m.r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
702 m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
703 m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
704 m.r[3] = F128::LoadA16(F128::v0001_);
// Builds a rotation matrix from per-axis sines and cosines (sin_xyz and
// cos_xyz), combining products of the z/y/x terms into the matrix elements.
// The temporaries are named after the elements they carry (for example
// m00_12_02_10 presumably holds m[0][0], m[1][2], m[0][2], m[1][0]).
// NOTE(review): the rest of the parameter list (cos_xyz), the declarations
// and initial values of m00_12_02_10 / m20_01_22_11 / m, and the return are
// on lines dropped from this listing (710-715, 719, 723-725), so the full
// element formulas cannot be verified from here.
709 NLIB_M(SimdMatrix) Matrix::FromRotationZXY(
SimdVectorArg sin_xyz,
716 f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
717 f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
718 f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
720 tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
721 tmp = F128::NegateEx<false, false, true, true>(tmp);
722 m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
726 f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
727 m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz,
each_select32);
730 f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
731 f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
734 m.r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
735 m.r[1] = F128::SetZeroToLane<3>(r1);
// Lane 1 of row 2 is -sin(x), taken as a scalar from lane 0 of sin_xyz.
736 m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
737 m.r[3] = F128::LoadA16(F128::v0001_);
744 SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
745 SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
747 f128 d0 = Vector3::Dot(r0, neg);
748 f128 d1 = Vector3::Dot(r1, neg);
749 f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
751 m.r[0] = F128::Splat<false, false, false, true>(r0, d0);
752 m.r[1] = F128::Splat<false, false, false, true>(r1, d1);
753 m.r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
754 m.r[3] = F128::LoadA16(F128::v0001_);
761 SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
762 eye_dir = Vector3::Normalize(eye_dir);
763 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
769 return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
775 SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
776 eye_dir = Vector3::Normalize(eye_dir);
777 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
// Builds a left-handed perspective projection from the view-volume width and
// height at the near plane.
// v = (2*near/width, 2*near/height, far/(far-near), -near*far/(far-near)).
// Row 0 keeps v lane 0, row 1 keeps v lane 1, row 2 keeps v lane 2 and gets
// 1.f in lane 3; row 3 is built from `zero` and v with Permute<0, 1, 7, 0>
// (presumably placing v lane 3 into lane 2).
// NOTE(review): width == 0, height == 0 or far_z == near_z divide by zero;
// no guard is visible.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
781 NLIB_M(SimdMatrix) Matrix::PerspectiveLh(
float width,
float height,
float near_z,
782 float far_z) NLIB_NOEXCEPT {
783 float near2 = near_z + near_z;
784 float range = far_z / (far_z - near_z);
785 f128 zero = F128::SetZero();
786 f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
788 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
789 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
790 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
791 m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
792 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
// Builds a right-handed perspective projection from the view-volume width and
// height at the near plane.
// v = (2*near/width, 2*near/height, far/(near-far), near*far/(near-far)).
// Differs from the left-handed variant in the sign of the depth range and in
// row 2 lane 3, which is -1.f here.
// NOTE(review): width == 0, height == 0 or far_z == near_z divide by zero;
// no guard is visible.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
797 NLIB_M(SimdMatrix) Matrix::PerspectiveRh(
float width,
float height,
float near_z,
798 float far_z) NLIB_NOEXCEPT {
799 float near2 = near_z + near_z;
800 float range = far_z / (near_z - far_z);
801 f128 zero = F128::SetZero();
802 f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
804 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
805 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
806 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
807 m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
808 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
// Builds a left-handed perspective projection from the sine and cosine of
// half the vertical field of view plus an aspect ratio.
// height = cos/sin (cotangent of the half angle), width = height / aspect,
// range = far/(far-near); v = (width, height, range, -range*near).
// Row layout matches PerspectiveLh: rows 0-2 keep one lane of v each, row 2
// gets 1.f in lane 3, and row 3 is built with Permute<0, 1, 7, 0>(zero, v).
// NOTE(review): half_fovy_sin == 0, aspect == 0 or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
813 NLIB_M(SimdMatrix) Matrix::PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
814 float near_z,
float far_z) NLIB_NOEXCEPT {
815 float height = half_fovy_cos / half_fovy_sin;
816 float width = height / aspect;
817 float range = far_z / (far_z - near_z);
819 f128 zero = F128::SetZero();
820 f128 v = F128::SetValue(width, height, range, -range * near_z);
822 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
823 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
824 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
825 m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
826 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
// Builds a right-handed perspective projection from the sine and cosine of
// half the vertical field of view plus an aspect ratio.
// height = cos/sin, width = height / aspect, range = far/(near-far);
// v = (width, height, range, range*near). Row 2 lane 3 is -1.f here, the
// right-handed counterpart of PerspectiveFovLh.
// NOTE(review): half_fovy_sin == 0, aspect == 0 or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declaration of `m` and the return are on lines dropped
// from this listing.
831 NLIB_M(SimdMatrix) Matrix::PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
832 float near_z,
float far_z) NLIB_NOEXCEPT {
833 float height = half_fovy_cos / half_fovy_sin;
834 float width = height / aspect;
835 float range = far_z / (near_z - far_z);
837 f128 zero = F128::SetZero();
838 f128 v = F128::SetValue(width, height, range, range * near_z);
840 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
841 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
842 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
843 m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
844 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
// Builds a left-handed off-center perspective projection from the
// left/right/bottom/top extents and the near/far distances.
// div = (1, 1, far, 1) / (right-left, top-bottom, far-near, 1), so one vector
// division supplies every reciprocal used below (assuming F128::Div is
// lane-wise).
// v0 = (2*near, 2*near, -near, 1) * div feeds rows 0, 1 and 3 (one lane each);
// row 2 = (-(left+right), -(top+bottom), 1, 1) * div.
// NOTE(review): right == left, top == bottom or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
849 NLIB_M(SimdMatrix) Matrix::PerspectiveOffCenterLh(
float left,
float right,
float bottom,
float top,
850 float near_z,
float far_z) NLIB_NOEXCEPT {
851 float near2 = near_z + near_z;
854 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
855 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
856 div = F128::Div(a, b);
859 f128 zero = F128::SetZero();
860 f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
861 f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
862 v0 = F128::Mult(v0, div);
865 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
866 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
867 m.r[2] = F128::Mult(r2, div);
868 m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
// Builds a right-handed off-center perspective projection from the
// left/right/bottom/top extents and the near/far distances.
// div = (1, 1, far, 1) / (right-left, top-bottom, near-far, -1); the -1 in
// lane 3 makes row 2 lane 3 come out as -1 (assuming F128::Div is lane-wise).
// v0 = (2*near, 2*near, near, 1) * div feeds rows 0, 1 and 3 (one lane each);
// row 2 = (left+right, top+bottom, 1, 1) * div.
// NOTE(review): right == left, top == bottom or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
873 NLIB_M(SimdMatrix) Matrix::PerspectiveOffCenterRh(
float left,
float right,
float bottom,
float top,
874 float near_z,
float far_z) NLIB_NOEXCEPT {
875 float near2 = near_z + near_z;
878 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
879 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
880 div = F128::Div(a, b);
883 f128 zero = F128::SetZero();
884 f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
885 f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
886 v0 = F128::Mult(v0, div);
889 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
890 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
891 m.r[2] = F128::Mult(r2, div);
892 m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
// Builds a left-handed orthographic projection from the view-volume width and
// height and the near/far distances.
// div = (2, 2, 1, -near) / (width, height, far-near, far-near), i.e.
// (2/width, 2/height, 1/(far-near), -near/(far-near)) assuming F128::Div is
// lane-wise. Rows 0-2 each keep one lane of div; row 3 is built from `zero`
// and div with Permute<0, 1, 7, 1> and then gets 1.f in lane 3.
// NOTE(review): width == 0, height == 0 or far_z == near_z divide by zero;
// no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
897 NLIB_M(SimdMatrix) Matrix::OrthographicLh(
float width,
float height,
float near_z,
898 float far_z) NLIB_NOEXCEPT {
901 f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
902 f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
903 div = F128::Div(a, b);
905 f128 zero = F128::SetZero();
908 m.r[0] = F128::Splat<false, true, true, true>(div, zero);
909 m.r[1] = F128::Splat<true, false, true, true>(div, zero);
910 m.r[2] = F128::Splat<true, true, false, true>(div, zero);
911 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
912 m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
// Builds a right-handed orthographic projection from the view-volume width
// and height and the near/far distances.
// div = (2, 2, 1, near) / (width, height, near-far, near-far), i.e.
// (2/width, 2/height, 1/(near-far), near/(near-far)) assuming F128::Div is
// lane-wise. Rows 0-2 each keep one lane of div; row 3 is built from `zero`
// and div with Permute<0, 1, 7, 1> and then gets 1.f in lane 3.
// NOTE(review): width == 0, height == 0 or far_z == near_z divide by zero;
// no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
917 NLIB_M(SimdMatrix) Matrix::OrthographicRh(
float width,
float height,
float near_z,
918 float far_z) NLIB_NOEXCEPT {
921 f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
922 f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
923 div = F128::Div(a, b);
925 f128 zero = F128::SetZero();
928 m.r[0] = F128::Splat<false, true, true, true>(div, zero);
929 m.r[1] = F128::Splat<true, false, true, true>(div, zero);
930 m.r[2] = F128::Splat<true, true, false, true>(div, zero);
931 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
932 m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
// Builds a left-handed off-center orthographic projection from the
// left/right/bottom/top extents and the near/far distances.
// div = 1 / (right-left, top-bottom, far-near, 1) (assuming F128::Div is
// lane-wise). v0 = (2, 2, 1, 1) * div supplies the diagonal of rows 0-2;
// row 3 = (-(left+right), -(top+bottom), -near, 1) * div.
// NOTE(review): right == left, top == bottom or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
937 NLIB_M(SimdMatrix) Matrix::OrthographicOffCenterLh(
float left,
float right,
float bottom,
float top,
938 float near_z,
float far_z) NLIB_NOEXCEPT {
941 f128 a = F128::SetOne();
942 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
943 div = F128::Div(a, b);
945 f128 zero = F128::SetZero();
946 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
947 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
948 v0 = F128::Mult(v0, div);
951 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
952 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
953 m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
954 m.r[3] = F128::Mult(r3, div);
// Builds a right-handed off-center orthographic projection from the
// left/right/bottom/top extents and the near/far distances.
// div = 1 / (right-left, top-bottom, near-far, 1) (assuming F128::Div is
// lane-wise). v0 = (2, 2, 1, 1) * div supplies the diagonal of rows 0-2;
// row 3 = (-(left+right), -(top+bottom), near, 1) * div.
// NOTE(review): right == left, top == bottom or far_z == near_z divide by
// zero; no guard is visible.
// NOTE(review): the declarations of `div` and `m` and the return are on lines
// dropped from this listing.
959 NLIB_M(SimdMatrix) Matrix::OrthographicOffCenterRh(
float left,
float right,
float bottom,
float top,
960 float near_z,
float far_z) NLIB_NOEXCEPT {
963 f128 a = F128::SetOne();
964 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
965 div = F128::Div(a, b);
967 f128 zero = F128::SetZero();
968 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
969 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
970 v0 = F128::Mult(v0, div);
973 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
974 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
975 m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
976 m.r[3] = F128::Mult(r3, div);
983 SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
985 f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
986 plane = F128::Negate(plane);
987 f128 r1 = F128::RotateLeft<1>(r0);
988 f128 r2 = F128::RotateLeft<2>(r0);
989 f128 r3 = F128::RotateLeft<3>(r0);
992 m.r[0] = F128::MultAdd<0>(plane, light_pos, r0,
each_select32);
993 m.r[1] = F128::MultAdd<1>(plane, light_pos, r1,
each_select32);
994 m.r[2] = F128::MultAdd<2>(plane, light_pos, r2,
each_select32);
995 m.r[3] = F128::MultAdd<3>(plane, light_pos, r3,
each_select32);
// Builds a matrix that reflects across the given plane.
// The plane is first scaled by Vector3::RecpLength(reflection_plane), which
// presumably normalizes it by the length of its xyz part. minus_2n is -2
// times that plane with lane 3 cleared. Starting from the identity, each row
// i is updated with F128::MultAdd<i>(plane, minus_2n, row, each_select32).
// NOTE(review): MultAdd<i>(a, b, c, each_select32) presumably computes
// c + a[i] * b -- confirm against the F128 documentation.
// NOTE(review): a plane with a zero-length normal is not guarded against in
// the visible lines, and the return is on a line dropped from this listing.
1000 NLIB_M(SimdMatrix) Matrix::Reflect(
SimdPlaneArg reflection_plane) NLIB_NOEXCEPT {
1002 SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1003 f128 minus_2n = F128::Mult(-2.f, plane);
1004 minus_2n = F128::SetZeroToLane<3>(minus_2n);
1006 SimdMatrix m = Matrix::Identity();
1007 m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0],
each_select32);
1008 m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1],
each_select32);
1009 m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2],
each_select32);
1010 m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3],
each_select32);
1017 SimdMatrixArg m) NLIB_NOEXCEPT {
1024 f128 dot_x = Vector3::DotEx<true, false, false, true>(m.r[0], m.r[0]);
1025 f128 dot_y = Vector3::DotEx<false, true, false, true>(m.r[1], m.r[1]);
1026 f128 dot_z = Vector3::DotEx<false, false, true, true>(m.r[2], m.r[2]);
1027 f128 dot = F128::Or(dot_x, dot_y);
1028 dot = F128::Or(dot, dot_z);
1029 recp_scale = F128::RecpSqrt(dot);
1030 *scale = F128::Mult(dot, recp_scale);
1034 rot->r[0] = F128::Mult<0>(recp_scale, m.r[0],
each_select32);
1035 rot->r[1] = F128::Mult<1>(recp_scale, m.r[1],
each_select32);
1036 rot->r[2] = F128::Mult<2>(recp_scale, m.r[2],
each_select32);
1037 rot->r[3] = F128::LoadA16(F128::v0001_);
1042 #endif // NLIB_DOXYGEN
1047 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
The class with the collection of functions that handle 4x4 matrices.
f128arg SimdVectorArg
f128arg is defined using typedef.
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constan...
f128arg SimdQuaternionArg
f128arg is defined using typedef.
f128arg SimdPlaneArg
f128arg is defined using typedef.
f128 r[4]
Keeps each row of a 4x4 matrix.
The structure for keeping a 4x4 matrix.
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
A macro for in-place matrix transposition.
Defines a three-dimensional vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Defines a four-dimensional vector.
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...