3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 4 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 37 static SimdMatrix __vectorcall FromScaling(
float scale_x,
float scale_y,
40 static SimdMatrix __vectorcall FromTranslation(
float ofs_x,
float ofs_y,
64 PerspectiveLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
66 PerspectiveRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
67 static SimdMatrix __vectorcall PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
68 float aspect,
float near_z,
70 static SimdMatrix __vectorcall PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
71 float aspect,
float near_z,
73 static SimdMatrix __vectorcall PerspectiveOffCenterLh(
float left,
float right,
float bottom,
74 float top,
float near_z,
76 static SimdMatrix __vectorcall PerspectiveOffCenterRh(
float left,
float right,
float bottom,
77 float top,
float near_z,
81 OrthographicLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
83 OrthographicRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
84 static SimdMatrix __vectorcall OrthographicOffCenterLh(
float left,
float right,
float bottom,
85 float top,
float near_z,
87 static SimdMatrix __vectorcall OrthographicOffCenterRh(
float left,
float right,
float bottom,
88 float top,
float near_z,
104 #define NLIB_M(tp) inline tp __vectorcall 108 m.
r[0] = F128::LoadA16(&p->
m[0][0]);
109 m.
r[1] = F128::LoadA16(&p->
m[1][0]);
110 m.
r[2] = F128::LoadA16(&p->
m[2][0]);
111 m.
r[3] = F128::LoadA16(&p->
m[3][0]);
118 m.
r[0] = F128::LoadA16(&p->
m[0][0]);
119 m.
r[1] = F128::LoadA16(&p->
m[1][0]);
120 m.
r[2] = F128::LoadA16(&p->
m[2][0]);
121 m.
r[3] = F128::Set0001();
127 f128 t0 = F128::LoadA16(&p->
m[0][0]);
128 f128 t1 = F128::LoadA16(&p->
m[1][1]);
129 f128 t2 = F128::LoadA16(&p->
m[2][2]);
131 m.
r[0] = F128::SetZeroToLane<3>(t0);
132 f128 tmp1 = F128::Permute<3, 4, 5, -1>(t0, t1);
133 m.
r[1] = F128::SetZeroToLane<3>(tmp1);
134 f128 tmp2 = F128::Permute<2, 3, 4, -1>(t1, t2);
135 m.
r[2] = F128::SetZeroToLane<3>(tmp2);
136 m.
r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
141 f128 t0 = F128::LoadA4(&p->
m[0][0]);
142 f128 t1 = F128::LoadA4(&p->
m[1][0]);
143 f128 t2 = F128::LoadA4(&p->
m[1][2]);
144 f128 zero = F128::SetZero();
146 m.
r[0] = F128::SetZeroToLane<3>(t0);
147 m.
r[1] = F128::SetZeroToLane<3>(t1);
148 m.
r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
149 m.
r[3] = F128::Set0001();
154 F128::StoreA16(&p->
m[0][0], m.
r[0]);
155 F128::StoreA16(&p->
m[1][0], m.
r[1]);
156 F128::StoreA16(&p->
m[2][0], m.
r[2]);
157 F128::StoreA16(&p->
m[3][0], m.
r[3]);
161 f128 t0 = F128::Permute<0, 1, 2, 4>(m.
r[0], m.
r[1]);
162 f128 t1 = F128::Permute<1, 2, 4, 5>(m.
r[1], m.
r[2]);
163 f128 t2 = F128::Permute<2, 4, 5, 6>(m.
r[2], m.
r[3]);
164 F128::StoreA16(&p->
m[0][0], t0);
165 F128::StoreA16(&p->
m[1][1], t1);
166 F128::StoreA16(&p->
m[2][2], t2);
170 f128 t0 = F128::Permute<0, 1, 2, 4>(m.
r[0], m.
r[1]);
171 f128 t1 = F128::Permute<1, 2, 4, 5>(m.
r[1], m.
r[2]);
172 F128::StoreA4(&p->
m[0][0], t0);
173 F128::StoreA4(&p->
m[1][1], t1);
174 p->
m[2][2] = F128::GetFloatFromLane<2>(m.
r[2]);
192 f128 c0det, c1det, c2det;
194 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.
r[2]);
195 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.
r[3]);
197 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.
r[2]);
198 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.
r[3]);
200 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.
r[2]);
201 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.
r[3]);
203 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
204 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
205 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
207 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
208 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
209 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
212 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.
r[1]);
213 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.
r[1]);
214 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.
r[1]);
215 f128 r0x = F128::NegateEx<true, false, true, false>(m.
r[0]);
217 f128 det3_neg = F128::Mult(c1det, ccbb_1);
218 det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
219 det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
220 return Vector4::Dot(r0x, det3_neg);
230 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 231 float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
232 float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
233 float32x2_t x00 = vcreate_f32(0ULL);
234 m.
r[0] = vcombine_f32(x10, x00);
235 m.
r[1] = vcombine_f32(x01, x00);
236 m.
r[2] = vcombine_f32(x00, x10);
237 m.
r[3] = vcombine_f32(x00, x01);
239 m.
r[0] = F128::LoadA16(F128::v1000_);
240 m.
r[1] = F128::LoadA16(F128::v0100_);
241 m.
r[2] = F128::LoadA16(F128::v0010_);
242 m.
r[3] = F128::LoadA16(F128::v0001_);
265 F128::StoreA16(&p->
m[0][0], M.
r[0]);
266 F128::StoreA16(&p->
m[1][0], M.
r[1]);
267 F128::StoreA16(&p->
m[2][0], M.
r[2]);
298 f128 detValueReciprocal;
302 f128 c0det, c1det, c2det;
304 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.
r[2], m.
r[1]);
305 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.
r[2], m.
r[1]);
307 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.
r[3], m.
r[2]);
308 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.
r[3], m.
r[2]);
310 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.
r[1], m.
r[0]);
311 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.
r[1], m.
r[0]);
313 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
314 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
315 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
317 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
318 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
319 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
322 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.
r[1], m.
r[0]);
323 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.
r[2], m.
r[1]);
324 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.
r[3], m.
r[2]);
326 f128 r0x_m = F128::Permute<0, -1, 4, -1>(m.
r[0], m.
r[2]);
327 f128 r0x_p = F128::Permute<-1, 0, -1, 4>(m.
r[1], m.
r[3]);
328 f128 r0x = F128::Permute<0, 5, 2, 7>(F128::Negate(r0x_m), r0x_p);
330 f128 det3 = F128::Mult(c1det, ccbb_1);
331 det3 = F128::MultSub(c0det, baaa_1, det3);
332 det3 = F128::MultSub(c2det, dddc_1, det3);
334 mydet = Vector4::Dot(r0x, det3);
336 det3 = F128::NegateEx<true, false, true, false>(det3);
337 detValueReciprocal = F128::Recp(mydet);
339 ret.
r[0] = F128::Mult(detValueReciprocal, det3);
342 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.
r[1], m.
r[0]);
343 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.
r[2], m.
r[1]);
344 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.
r[3], m.
r[2]);
346 f128 det3 = F128::Mult(c0det, baaa_0);
347 det3 = F128::MultAdd(c2det, dddc_0, det3);
348 det3 = F128::MultSub(c1det, ccbb_0, det3);
349 det3 = F128::NegateEx<true, false, true, false>(det3);
350 ret.
r[1] = F128::Mult(detValueReciprocal, det3);
366 f128 c0det, c1det, c2det;
368 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.
r[2], m.
r[1]);
369 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.
r[2], m.
r[1]);
371 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.
r[3], m.
r[2]);
372 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.
r[3], m.
r[2]);
374 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.
r[1], m.
r[0]);
375 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.
r[1], m.
r[0]);
377 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
378 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
379 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
381 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
382 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
383 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
386 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.
r[1], m.
r[0]);
387 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.
r[2], m.
r[1]);
388 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.
r[3], m.
r[2]);
390 f128 det3 = F128::Mult(c1det, ccbb_3);
391 det3 = F128::MultSub(c0det, baaa_3, det3);
392 det3 = F128::MultSub(c2det, dddc_3, det3);
393 det3 = F128::NegateEx<true, false, true, false>(det3);
395 ret.
r[2] = F128::Mult(detValueReciprocal, det3);
398 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.
r[1], m.
r[0]);
399 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.
r[2], m.
r[1]);
400 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.
r[3], m.
r[2]);
402 f128 det3 = F128::Mult(c0det, baaa_2);
403 det3 = F128::MultAdd(c2det, dddc_2, det3);
404 det3 = F128::MultSub(c1det, ccbb_2, det3);
405 det3 = F128::NegateEx<true, false, true, false>(det3);
407 ret.
r[3] = F128::Mult(detValueReciprocal, det3);
442 f128 detValueReciprocal;
445 f128 c0det, c1det, c2det;
447 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.
r[2]);
448 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.
r[3]);
450 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.
r[2]);
451 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.
r[3]);
453 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.
r[2]);
454 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.
r[3]);
456 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
457 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
458 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
460 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
461 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
462 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
465 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.
r[1]);
466 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.
r[1]);
467 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.
r[1]);
468 f128 r0x = F128::NegateEx<true, false, true, false>(M.
r[0]);
470 f128 det3 = F128::Mult(c1det, ccbb_1);
471 det3 = F128::MultSub(c0det, baaa_1, det3);
472 det3 = F128::MultSub(c2det, dddc_1, det3);
474 detValueReciprocal = Vector4::Dot(r0x, det3);
476 *det = detValueReciprocal;
479 det3 = F128::NegateEx<true, false, true, false>(det3);
480 detValueReciprocal = F128::Recp(detValueReciprocal);
482 ret.
r[0] = F128::Mult(detValueReciprocal, det3);
485 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.
r[0]);
486 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.
r[0]);
487 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.
r[0]);
489 f128 det3 = F128::Mult(c0det, baaa_0);
490 det3 = F128::MultAdd(c2det, dddc_0, det3);
491 det3 = F128::MultSub(c1det, ccbb_0, det3);
492 det3 = F128::NegateEx<true, false, true, false>(det3);
493 ret.
r[1] = F128::Mult(detValueReciprocal, det3);
509 f128 c0det, c1det, c2det;
511 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.
r[0]);
512 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.
r[1]);
514 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.
r[0]);
515 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.
r[1]);
517 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.
r[0]);
518 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.
r[1]);
520 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
521 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
522 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
524 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
525 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
526 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
529 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.
r[3]);
530 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.
r[3]);
531 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.
r[3]);
533 f128 det3 = F128::Mult(c1det, ccbb_3);
534 det3 = F128::MultSub(c0det, baaa_3, det3);
535 det3 = F128::MultSub(c2det, dddc_3, det3);
536 det3 = F128::NegateEx<true, false, true, false>(det3);
538 ret.
r[2] = F128::Mult(detValueReciprocal, det3);
541 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.
r[2]);
542 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.
r[2]);
543 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.
r[2]);
545 f128 det3 = F128::Mult(c0det, baaa_2);
546 det3 = F128::MultAdd(c2det, dddc_2, det3);
547 det3 = F128::MultSub(c1det, ccbb_2, det3);
548 det3 = F128::NegateEx<true, false, true, false>(det3);
550 ret.
r[3] = F128::Mult(detValueReciprocal, det3);
561 f128 cmp0 = F128::CmpEq(x.
r[0], m.
r[0]);
562 f128 cmp1 = F128::CmpEq(x.
r[1], m.
r[1]);
563 f128 cmp2 = F128::CmpEq(x.
r[2], m.
r[2]);
564 f128 cmp3 = F128::CmpEq(x.
r[3], m.
r[3]);
565 cmp0 = F128::And(cmp0, cmp1);
566 cmp2 = F128::And(cmp2, cmp3);
567 cmp0 = F128::And(cmp0, cmp2);
568 return F128::IsAllMaskTrue(cmp0);
574 #ifdef NLIB_F128_SIMD_NOUSE 575 f128 cmp0 = F128::IsInfinite(m.
r[0]);
576 f128 cmp1 = F128::IsInfinite(m.
r[1]);
577 f128 cmp2 = F128::IsInfinite(m.
r[2]);
578 f128 cmp3 = F128::IsInfinite(m.
r[3]);
579 cmp0 = F128::Or(cmp0, cmp1);
580 cmp2 = F128::Or(cmp2, cmp3);
581 cmp0 = F128::Or(cmp0, cmp2);
582 return !F128::IsAllMaskFalse(cmp0);
584 f128 inf_value = F128::SetInfinity();
585 f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.
r[0]));
586 f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.
r[1]));
587 f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.
r[2]));
588 f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.
r[3]));
589 cmp0 = F128::Or(cmp0, cmp1);
590 cmp2 = F128::Or(cmp2, cmp3);
591 cmp0 = F128::Or(cmp0, cmp2);
592 return !F128::IsAllMaskFalse(cmp0);
599 f128 cmp0 = F128::IsNaN(m.
r[0]);
600 f128 cmp1 = F128::IsNaN(m.
r[1]);
601 f128 cmp2 = F128::IsNaN(m.
r[2]);
602 f128 cmp3 = F128::IsNaN(m.
r[3]);
603 cmp0 = F128::Or(cmp0, cmp1);
604 cmp2 = F128::Or(cmp2, cmp3);
605 cmp0 = F128::Or(cmp0, cmp2);
606 return !F128::IsAllMaskFalse(cmp0);
613 m.
r[0] = Vector4::Transform(a.
r[0], b);
614 m.
r[1] = Vector4::Transform(a.
r[1], b);
615 m.
r[2] = Vector4::Transform(a.
r[2], b);
616 m.
r[3] = Vector4::Transform(a.
r[3], b);
623 f128 r0 = Vector4::Transform(a.
r[0], b);
624 f128 r1 = Vector4::Transform(a.
r[1], b);
625 f128 r2 = Vector4::Transform(a.
r[2], b);
626 f128 r3 = Vector4::Transform(a.
r[3], b);
643 f128 zero = F128::SetZero();
644 m.
r[0] = F128::SetFloatToLane<0>(zero, scale_x);
645 m.
r[1] = F128::SetFloatToLane<1>(zero, scale_y);
646 m.
r[2] = F128::SetFloatToLane<2>(zero, scale_z);
647 m.
r[3] = F128::Set0001();
658 f128 zero = F128::SetZero();
659 m.
r[0] = F128::Splat<false, true, true, true>(scale, zero);
660 m.
r[1] = F128::Splat<true, false, true, true>(scale, zero);
661 m.
r[2] = F128::Splat<true, true, false, true>(scale, zero);
662 m.
r[3] = F128::Set0001();
673 m.
r[0] = F128::Set1000();
674 m.
r[1] = F128::Set0100();
675 m.
r[2] = F128::Set0010();
676 m.
r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
687 m.
r[0] = F128::Set1000();
688 m.
r[1] = F128::Set0100();
689 m.
r[2] = F128::Set0010();
690 m.
r[3] = F128::Permute<0, 1, 2, 4>(ofs, m.
r[0]);
702 f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
703 r1 = F128::SetFloatToLane<2>(r1, sin_value);
704 f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
705 r2 = F128::SetFloatToLane<2>(r2, cos_value);
707 m.
r[0] = F128::Set1000();
710 m.
r[3] = F128::Set0001();
722 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
723 r0 = F128::SetFloatToLane<2>(r0, -sin_value);
724 f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
725 r2 = F128::SetFloatToLane<2>(r2, cos_value);
728 m.
r[1] = F128::Set0100();
730 m.
r[3] = F128::Set0001();
742 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
743 r0 = F128::SetFloatToLane<1>(r0, sin_value);
744 f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
745 r1 = F128::SetFloatToLane<1>(r1, cos_value);
749 m.
r[2] = F128::Set0010();
750 m.
r[3] = F128::Set0001();
762 f128 nn = F128::Mult(axis_normalized, axis_normalized);
764 c1 = F128::SetValue(1.f - cos_value,
each_float);
765 diagonal = F128::MultAdd(c1, nn, c);
766 diagonal = F128::SetZeroToLane<3>(diagonal);
769 f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
771 f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
772 xy_yz_xz = F128::Mult(c1, xy_yz_xz);
773 f128 plus = F128::MultAdd(s, zxy, xy_yz_xz);
774 f128 minus = F128::MultSub(s, zxy, xy_yz_xz);
776 f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
777 f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
780 m.
r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
781 m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
782 m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
783 m.r[3] = F128::Set0001();
791 f128 q2 = F128::Add(quat, quat);
792 f128 qq2 = F128::Mult(quat, q2);
795 t0 = F128::Swizzle<1, 0, 0, -1>(qq2);
796 t1 = F128::Swizzle<2, 2, 1, -1>(qq2);
798 f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
799 diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
801 t0 = F128::Swizzle<1, 0, 0, -1>(quat);
802 t1 = F128::Swizzle<2, 2, 1, -1>(q2);
803 f128 yz_xz_xy = F128::Mult(t0, t1);
806 f128 wx_wy_wz = F128::Mult(q2, t0);
808 f128 plus = F128::Add(yz_xz_xy, wx_wy_wz);
809 f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz);
811 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
812 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
815 m.
r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
816 m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
817 m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
818 m.r[3] = F128::Set0001();
830 f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
831 f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
832 f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
834 tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
835 tmp = F128::NegateEx<false, false, true, true>(tmp);
836 m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
840 f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
841 m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz,
each_select32);
844 f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
845 f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
848 m.
r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
849 m.r[1] = F128::SetZeroToLane<3>(r1);
850 m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
851 m.r[3] = F128::Set0001();
858 SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
859 SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
862 neg = F128::SetZeroToLane<3>(neg);
863 f128 d012 = Vector4::Dot3(neg, r0, r1, eye_dir_normalized);
867 m.
r[2] = eye_dir_normalized;
868 m.
r[3] = F128::Set0001();
870 m.
r[3] = F128::SetFloatToLane<3>(d012, 1.f);
873 f128 d0 = Vector3::Dot(r0, neg);
874 f128 d1 = Vector3::Dot(r1, neg);
875 f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
877 m.
r[0] = F128::Splat<false, false, false, true>(r0, d0);
878 m.
r[1] = F128::Splat<false, false, false, true>(r1, d1);
879 m.
r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
880 m.
r[3] = F128::Set0001();
888 SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
889 eye_dir = Vector3::Normalize(eye_dir);
890 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
896 return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
902 SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
903 eye_dir = Vector3::Normalize(eye_dir);
904 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
908 NLIB_M(
SimdMatrix) Matrix::PerspectiveLh(
float width,
float height,
float near_z,
910 float near2 = near_z + near_z;
911 float range = far_z / (far_z - near_z);
912 f128 zero = F128::SetZero();
913 f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
915 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
916 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
917 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
918 m.
r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
919 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
924 NLIB_M(
SimdMatrix) Matrix::PerspectiveRh(
float width,
float height,
float near_z,
926 float near2 = near_z + near_z;
927 float range = far_z / (near_z - far_z);
928 f128 zero = F128::SetZero();
929 f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
931 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
932 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
933 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
934 m.
r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
935 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
940 NLIB_M(
SimdMatrix) Matrix::PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
942 float height = half_fovy_cos / half_fovy_sin;
943 float width = height / aspect;
944 float range = far_z / (far_z - near_z);
946 f128 zero = F128::SetZero();
947 f128 v = F128::SetValue(width, height, range, -range * near_z);
949 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
950 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
951 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
952 m.
r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
953 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
958 NLIB_M(
SimdMatrix) Matrix::PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
960 float height = half_fovy_cos / half_fovy_sin;
961 float width = height / aspect;
962 float range = far_z / (near_z - far_z);
964 f128 zero = F128::SetZero();
965 f128 v = F128::SetValue(width, height, range, range * near_z);
967 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
968 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
969 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
970 m.
r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
971 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
976 NLIB_M(
SimdMatrix) Matrix::PerspectiveOffCenterLh(
float left,
float right,
float bottom,
float top,
978 float near2 = near_z + near_z;
981 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
982 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
983 div = F128::Div(a, b);
986 f128 zero = F128::SetZero();
987 f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
988 f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
989 v0 = F128::Mult(v0, div);
992 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
993 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
994 m.
r[2] = F128::Mult(r2, div);
995 m.
r[3] = F128::Splat<true, true, false, true>(v0, zero);
1000 NLIB_M(
SimdMatrix) Matrix::PerspectiveOffCenterRh(
float left,
float right,
float bottom,
float top,
1002 float near2 = near_z + near_z;
1005 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1006 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
1007 div = F128::Div(a, b);
1010 f128 zero = F128::SetZero();
1011 f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
1012 f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
1013 v0 = F128::Mult(v0, div);
1016 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1017 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1018 m.
r[2] = F128::Mult(r2, div);
1019 m.
r[3] = F128::Splat<true, true, false, true>(v0, zero);
1024 NLIB_M(
SimdMatrix) Matrix::OrthographicLh(
float width,
float height,
float near_z,
1028 f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
1029 f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
1030 div = F128::Div(a, b);
1032 f128 zero = F128::SetZero();
1035 m.
r[0] = F128::Splat<false, true, true, true>(div, zero);
1036 m.
r[1] = F128::Splat<true, false, true, true>(div, zero);
1037 m.
r[2] = F128::Splat<true, true, false, true>(div, zero);
1038 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1039 m.
r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1044 NLIB_M(
SimdMatrix) Matrix::OrthographicRh(
float width,
float height,
float near_z,
1048 f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
1049 f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
1050 div = F128::Div(a, b);
1052 f128 zero = F128::SetZero();
1055 m.
r[0] = F128::Splat<false, true, true, true>(div, zero);
1056 m.
r[1] = F128::Splat<true, false, true, true>(div, zero);
1057 m.
r[2] = F128::Splat<true, true, false, true>(div, zero);
1058 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1059 m.
r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1064 NLIB_M(
SimdMatrix) Matrix::OrthographicOffCenterLh(
float left,
float right,
float bottom,
float top,
1068 f128 a = F128::SetOne();
1069 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1070 div = F128::Div(a, b);
1072 f128 zero = F128::SetZero();
1073 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1074 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
1075 v0 = F128::Mult(v0, div);
1078 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1079 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1080 m.
r[2] = F128::Splat<true, true, false, true>(v0, zero);
1081 m.
r[3] = F128::Mult(r3, div);
1086 NLIB_M(
SimdMatrix) Matrix::OrthographicOffCenterRh(
float left,
float right,
float bottom,
float top,
1090 f128 a = F128::SetOne();
1091 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
1092 div = F128::Div(a, b);
1094 f128 zero = F128::SetZero();
1095 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1096 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
1097 v0 = F128::Mult(v0, div);
1100 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1101 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1102 m.
r[2] = F128::Splat<true, true, false, true>(v0, zero);
1103 m.
r[3] = F128::Mult(r3, div);
1110 SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
1112 f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
1113 plane = F128::Negate(plane);
1114 f128 r1 = F128::RotateLeft<1>(r0);
1115 f128 r2 = F128::RotateLeft<2>(r0);
1116 f128 r3 = F128::RotateLeft<3>(r0);
1120 m.r[1] = F128::MultAdd<1>(plane, light_pos, r1,
each_select32);
1121 m.r[2] = F128::MultAdd<2>(plane, light_pos, r2,
each_select32);
1122 m.r[3] = F128::MultAdd<3>(plane, light_pos, r3,
each_select32);
1129 SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1130 f128 minus_2n = F128::Mult(-2.f, plane);
1131 minus_2n = F128::SetZeroToLane<3>(minus_2n);
1134 m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0],
each_select32);
1135 m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1],
each_select32);
1136 m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2],
each_select32);
1137 m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3],
each_select32);
1151 f128 dot_x = Vector3::DotEx<true, false, false, true>(m.
r[0], m.
r[0]);
1152 f128 dot_y = Vector3::DotEx<false, true, false, true>(m.
r[1], m.
r[1]);
1153 f128 dot_z = Vector3::DotEx<false, false, true, true>(m.
r[2], m.
r[2]);
1154 f128 dot = F128::Or(dot_x, dot_y);
1155 dot = F128::Or(dot, dot_z);
1156 recp_scale = F128::RecpSqrt(dot);
1157 *scale = F128::Mult(dot, recp_scale);
1164 rot->
r[3] = F128::Set0001();
1169 #endif // NLIB_DOXYGEN 1174 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ The class with the collection of functions that handle 4x4 matrices.
float m[3][4]
A 2D 3x4 array.
f128arg SimdVectorArg
f128arg is defined using typedef.
float m[4][3]
A 2D 4x3 array.
float m[4][4]
A 2D 4x4 array.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
A macro for in-place matrix transposition.
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constant object.
f128arg SimdQuaternionArg
f128arg is defined using typedef.
f128arg SimdPlaneArg
f128arg is defined using typedef.
f128 r[4]
Keeps each row of a 4x4 matrix.
The structure for keeping a 4x4 matrix.
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Defines a three-dimensional vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Defines a four-dimensional vector.
float m[3][3]
A 2D 3x3 array.
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...