16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 50 static SimdMatrix __vectorcall FromScaling(
float scale_x,
float scale_y,
53 static SimdMatrix __vectorcall FromTranslation(
float ofs_x,
float ofs_y,
77 PerspectiveLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
79 PerspectiveRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
80 static SimdMatrix __vectorcall PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
81 float aspect,
float near_z,
83 static SimdMatrix __vectorcall PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
84 float aspect,
float near_z,
86 static SimdMatrix __vectorcall PerspectiveOffCenterLh(
float left,
float right,
float bottom,
87 float top,
float near_z,
89 static SimdMatrix __vectorcall PerspectiveOffCenterRh(
float left,
float right,
float bottom,
90 float top,
float near_z,
94 OrthographicLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
96 OrthographicRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT;
97 static SimdMatrix __vectorcall OrthographicOffCenterLh(
float left,
float right,
float bottom,
98 float top,
float near_z,
100 static SimdMatrix __vectorcall OrthographicOffCenterRh(
float left,
float right,
float bottom,
101 float top,
float near_z,
117 #define NLIB_M(tp) inline tp __vectorcall 121 m.
r[0] = F128::LoadA16(&p->
m[0][0]);
122 m.
r[1] = F128::LoadA16(&p->
m[1][0]);
123 m.
r[2] = F128::LoadA16(&p->
m[2][0]);
124 m.
r[3] = F128::LoadA16(&p->
m[3][0]);
131 m.
r[0] = F128::LoadA16(&p->
m[0][0]);
132 m.
r[1] = F128::LoadA16(&p->
m[1][0]);
133 m.
r[2] = F128::LoadA16(&p->
m[2][0]);
134 m.
r[3] = F128::Set0001();
140 f128 t0 = F128::LoadA16(&p->
m[0][0]);
141 f128 t1 = F128::LoadA16(&p->
m[1][1]);
142 f128 t2 = F128::LoadA16(&p->
m[2][2]);
144 m.
r[0] = F128::SetZeroToLane<3>(t0);
145 f128 tmp1 = F128::Permute<3, 4, 5, -1>(t0, t1);
146 m.
r[1] = F128::SetZeroToLane<3>(tmp1);
147 f128 tmp2 = F128::Permute<2, 3, 4, -1>(t1, t2);
148 m.
r[2] = F128::SetZeroToLane<3>(tmp2);
149 m.
r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
154 f128 t0 = F128::LoadA4(&p->
m[0][0]);
155 f128 t1 = F128::LoadA4(&p->
m[1][0]);
156 f128 t2 = F128::LoadA4(&p->
m[1][2]);
157 f128 zero = F128::SetZero();
159 m.
r[0] = F128::SetZeroToLane<3>(t0);
160 m.
r[1] = F128::SetZeroToLane<3>(t1);
161 m.
r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
162 m.
r[3] = F128::Set0001();
167 F128::StoreA16(&p->
m[0][0], m.
r[0]);
168 F128::StoreA16(&p->
m[1][0], m.
r[1]);
169 F128::StoreA16(&p->
m[2][0], m.
r[2]);
170 F128::StoreA16(&p->
m[3][0], m.
r[3]);
174 f128 t0 = F128::Permute<0, 1, 2, 4>(m.
r[0], m.
r[1]);
175 f128 t1 = F128::Permute<1, 2, 4, 5>(m.
r[1], m.
r[2]);
176 f128 t2 = F128::Permute<2, 4, 5, 6>(m.
r[2], m.
r[3]);
177 F128::StoreA16(&p->
m[0][0], t0);
178 F128::StoreA16(&p->
m[1][1], t1);
179 F128::StoreA16(&p->
m[2][2], t2);
183 f128 t0 = F128::Permute<0, 1, 2, 4>(m.
r[0], m.
r[1]);
184 f128 t1 = F128::Permute<1, 2, 4, 5>(m.
r[1], m.
r[2]);
185 F128::StoreA4(&p->
m[0][0], t0);
186 F128::StoreA4(&p->
m[1][1], t1);
187 p->
m[2][2] = F128::GetFloatFromLane<2>(m.
r[2]);
205 f128 c0det, c1det, c2det;
207 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.
r[2]);
208 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.
r[3]);
210 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.
r[2]);
211 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.
r[3]);
213 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.
r[2]);
214 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.
r[3]);
216 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
217 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
218 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
220 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
221 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
222 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
225 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.
r[1]);
226 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.
r[1]);
227 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.
r[1]);
228 f128 r0x = F128::NegateEx<true, false, true, false>(m.
r[0]);
230 f128 det3_neg = F128::Mult(c1det, ccbb_1);
231 det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
232 det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
233 return Vector4::Dot(r0x, det3_neg);
243 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 244 float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
245 float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
246 float32x2_t x00 = vcreate_f32(0ULL);
247 m.
r[0] = vcombine_f32(x10, x00);
248 m.
r[1] = vcombine_f32(x01, x00);
249 m.
r[2] = vcombine_f32(x00, x10);
250 m.
r[3] = vcombine_f32(x00, x01);
252 m.
r[0] = F128::LoadA16(F128::v1000_);
253 m.
r[1] = F128::LoadA16(F128::v0100_);
254 m.
r[2] = F128::LoadA16(F128::v0010_);
255 m.
r[3] = F128::LoadA16(F128::v0001_);
278 F128::StoreA16(&p->
m[0][0], M.
r[0]);
279 F128::StoreA16(&p->
m[1][0], M.
r[1]);
280 F128::StoreA16(&p->
m[2][0], M.
r[2]);
311 f128 detvalue_reciprocal;
315 f128 c0det, c1det, c2det;
317 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.
r[2], m.
r[1]);
318 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.
r[2], m.
r[1]);
320 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.
r[3], m.
r[2]);
321 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.
r[3], m.
r[2]);
323 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.
r[1], m.
r[0]);
324 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.
r[1], m.
r[0]);
326 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
327 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
328 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
330 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
331 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
332 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
335 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.
r[1], m.
r[0]);
336 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.
r[2], m.
r[1]);
337 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.
r[3], m.
r[2]);
339 f128 r0x_m = F128::Permute<0, -1, 4, -1>(m.
r[0], m.
r[2]);
340 f128 r0x_p = F128::Permute<-1, 0, -1, 4>(m.
r[1], m.
r[3]);
341 f128 r0x = F128::Permute<0, 5, 2, 7>(F128::Negate(r0x_m), r0x_p);
343 f128 det3 = F128::Mult(c1det, ccbb_1);
344 det3 = F128::MultSub(c0det, baaa_1, det3);
345 det3 = F128::MultSub(c2det, dddc_1, det3);
347 mydet = Vector4::Dot(r0x, det3);
349 det3 = F128::NegateEx<true, false, true, false>(det3);
350 detvalue_reciprocal = F128::Recp(mydet);
352 ret.
r[0] = F128::Mult(detvalue_reciprocal, det3);
355 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.
r[1], m.
r[0]);
356 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.
r[2], m.
r[1]);
357 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.
r[3], m.
r[2]);
359 f128 det3 = F128::Mult(c0det, baaa_0);
360 det3 = F128::MultAdd(c2det, dddc_0, det3);
361 det3 = F128::MultSub(c1det, ccbb_0, det3);
362 det3 = F128::NegateEx<true, false, true, false>(det3);
363 ret.
r[1] = F128::Mult(detvalue_reciprocal, det3);
379 f128 c0det, c1det, c2det;
381 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.
r[2], m.
r[1]);
382 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.
r[2], m.
r[1]);
384 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.
r[3], m.
r[2]);
385 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.
r[3], m.
r[2]);
387 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.
r[1], m.
r[0]);
388 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.
r[1], m.
r[0]);
390 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
391 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
392 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
394 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
395 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
396 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
399 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.
r[1], m.
r[0]);
400 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.
r[2], m.
r[1]);
401 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.
r[3], m.
r[2]);
403 f128 det3 = F128::Mult(c1det, ccbb_3);
404 det3 = F128::MultSub(c0det, baaa_3, det3);
405 det3 = F128::MultSub(c2det, dddc_3, det3);
406 det3 = F128::NegateEx<true, false, true, false>(det3);
408 ret.
r[2] = F128::Mult(detvalue_reciprocal, det3);
411 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.
r[1], m.
r[0]);
412 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.
r[2], m.
r[1]);
413 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.
r[3], m.
r[2]);
415 f128 det3 = F128::Mult(c0det, baaa_2);
416 det3 = F128::MultAdd(c2det, dddc_2, det3);
417 det3 = F128::MultSub(c1det, ccbb_2, det3);
418 det3 = F128::NegateEx<true, false, true, false>(det3);
420 ret.
r[3] = F128::Mult(detvalue_reciprocal, det3);
455 f128 detvalue_reciprocal;
458 f128 c0det, c1det, c2det;
460 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.
r[2]);
461 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.
r[3]);
463 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.
r[2]);
464 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.
r[3]);
466 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.
r[2]);
467 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.
r[3]);
469 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
470 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
471 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
473 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
474 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
475 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
478 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.
r[1]);
479 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.
r[1]);
480 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.
r[1]);
481 f128 r0x = F128::NegateEx<true, false, true, false>(M.
r[0]);
483 f128 det3 = F128::Mult(c1det, ccbb_1);
484 det3 = F128::MultSub(c0det, baaa_1, det3);
485 det3 = F128::MultSub(c2det, dddc_1, det3);
487 detvalue_reciprocal = Vector4::Dot(r0x, det3);
489 *det = detvalue_reciprocal;
492 det3 = F128::NegateEx<true, false, true, false>(det3);
493 detvalue_reciprocal = F128::Recp(detvalue_reciprocal);
495 ret.
r[0] = F128::Mult(detvalue_reciprocal, det3);
498 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.
r[0]);
499 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.
r[0]);
500 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.
r[0]);
502 f128 det3 = F128::Mult(c0det, baaa_0);
503 det3 = F128::MultAdd(c2det, dddc_0, det3);
504 det3 = F128::MultSub(c1det, ccbb_0, det3);
505 det3 = F128::NegateEx<true, false, true, false>(det3);
506 ret.
r[1] = F128::Mult(detvalue_reciprocal, det3);
522 f128 c0det, c1det, c2det;
524 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.
r[0]);
525 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.
r[1]);
527 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.
r[0]);
528 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.
r[1]);
530 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.
r[0]);
531 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.
r[1]);
533 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
534 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
535 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
537 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
538 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
539 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
542 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.
r[3]);
543 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.
r[3]);
544 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.
r[3]);
546 f128 det3 = F128::Mult(c1det, ccbb_3);
547 det3 = F128::MultSub(c0det, baaa_3, det3);
548 det3 = F128::MultSub(c2det, dddc_3, det3);
549 det3 = F128::NegateEx<true, false, true, false>(det3);
551 ret.
r[2] = F128::Mult(detvalue_reciprocal, det3);
554 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.
r[2]);
555 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.
r[2]);
556 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.
r[2]);
558 f128 det3 = F128::Mult(c0det, baaa_2);
559 det3 = F128::MultAdd(c2det, dddc_2, det3);
560 det3 = F128::MultSub(c1det, ccbb_2, det3);
561 det3 = F128::NegateEx<true, false, true, false>(det3);
563 ret.
r[3] = F128::Mult(detvalue_reciprocal, det3);
574 f128 cmp0 = F128::CmpEq(x.
r[0], m.
r[0]);
575 f128 cmp1 = F128::CmpEq(x.
r[1], m.
r[1]);
576 f128 cmp2 = F128::CmpEq(x.
r[2], m.
r[2]);
577 f128 cmp3 = F128::CmpEq(x.
r[3], m.
r[3]);
578 cmp0 = F128::And(cmp0, cmp1);
579 cmp2 = F128::And(cmp2, cmp3);
580 cmp0 = F128::And(cmp0, cmp2);
581 return F128::IsAllMaskTrue(cmp0);
587 #ifdef NLIB_F128_SIMD_NOUSE 588 f128 cmp0 = F128::IsInfinite(m.
r[0]);
589 f128 cmp1 = F128::IsInfinite(m.
r[1]);
590 f128 cmp2 = F128::IsInfinite(m.
r[2]);
591 f128 cmp3 = F128::IsInfinite(m.
r[3]);
592 cmp0 = F128::Or(cmp0, cmp1);
593 cmp2 = F128::Or(cmp2, cmp3);
594 cmp0 = F128::Or(cmp0, cmp2);
595 return !F128::IsAllMaskFalse(cmp0);
597 f128 inf_value = F128::SetInfinity();
598 f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.
r[0]));
599 f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.
r[1]));
600 f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.
r[2]));
601 f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.
r[3]));
602 cmp0 = F128::Or(cmp0, cmp1);
603 cmp2 = F128::Or(cmp2, cmp3);
604 cmp0 = F128::Or(cmp0, cmp2);
605 return !F128::IsAllMaskFalse(cmp0);
612 f128 cmp0 = F128::IsNaN(m.
r[0]);
613 f128 cmp1 = F128::IsNaN(m.
r[1]);
614 f128 cmp2 = F128::IsNaN(m.
r[2]);
615 f128 cmp3 = F128::IsNaN(m.
r[3]);
616 cmp0 = F128::Or(cmp0, cmp1);
617 cmp2 = F128::Or(cmp2, cmp3);
618 cmp0 = F128::Or(cmp0, cmp2);
619 return !F128::IsAllMaskFalse(cmp0);
626 m.
r[0] = Vector4::Transform(a.
r[0], b);
627 m.
r[1] = Vector4::Transform(a.
r[1], b);
628 m.
r[2] = Vector4::Transform(a.
r[2], b);
629 m.
r[3] = Vector4::Transform(a.
r[3], b);
636 f128 r0 = Vector4::Transform(a.
r[0], b);
637 f128 r1 = Vector4::Transform(a.
r[1], b);
638 f128 r2 = Vector4::Transform(a.
r[2], b);
639 f128 r3 = Vector4::Transform(a.
r[3], b);
656 f128 zero = F128::SetZero();
657 m.
r[0] = F128::SetFloatToLane<0>(zero, scale_x);
658 m.
r[1] = F128::SetFloatToLane<1>(zero, scale_y);
659 m.
r[2] = F128::SetFloatToLane<2>(zero, scale_z);
660 m.
r[3] = F128::Set0001();
671 f128 zero = F128::SetZero();
672 m.
r[0] = F128::Splat<false, true, true, true>(scale, zero);
673 m.
r[1] = F128::Splat<true, false, true, true>(scale, zero);
674 m.
r[2] = F128::Splat<true, true, false, true>(scale, zero);
675 m.
r[3] = F128::Set0001();
686 m.
r[0] = F128::Set1000();
687 m.
r[1] = F128::Set0100();
688 m.
r[2] = F128::Set0010();
689 m.
r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
700 m.
r[0] = F128::Set1000();
701 m.
r[1] = F128::Set0100();
702 m.
r[2] = F128::Set0010();
703 m.
r[3] = F128::Permute<0, 1, 2, 4>(ofs, m.
r[0]);
715 f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
716 r1 = F128::SetFloatToLane<2>(r1, sin_value);
717 f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
718 r2 = F128::SetFloatToLane<2>(r2, cos_value);
720 m.
r[0] = F128::Set1000();
723 m.
r[3] = F128::Set0001();
735 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
736 r0 = F128::SetFloatToLane<2>(r0, -sin_value);
737 f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
738 r2 = F128::SetFloatToLane<2>(r2, cos_value);
741 m.
r[1] = F128::Set0100();
743 m.
r[3] = F128::Set0001();
755 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
756 r0 = F128::SetFloatToLane<1>(r0, sin_value);
757 f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
758 r1 = F128::SetFloatToLane<1>(r1, cos_value);
762 m.
r[2] = F128::Set0010();
763 m.
r[3] = F128::Set0001();
775 f128 nn = F128::Mult(axis_normalized, axis_normalized);
777 c1 = F128::SetValue(1.f - cos_value,
each_float);
778 diagonal = F128::MultAdd(c1, nn, c);
779 diagonal = F128::SetZeroToLane<3>(diagonal);
782 f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
784 f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
785 xy_yz_xz = F128::Mult(c1, xy_yz_xz);
786 f128 plus = F128::MultAdd(s, zxy, xy_yz_xz);
787 f128 minus = F128::MultSub(s, zxy, xy_yz_xz);
789 f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
790 f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
793 m.
r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
794 m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
795 m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
796 m.r[3] = F128::Set0001();
804 f128 q2 = F128::Add(quat, quat);
805 f128 qq2 = F128::Mult(quat, q2);
808 t0 = F128::Swizzle<1, 0, 0, -1>(qq2);
809 t1 = F128::Swizzle<2, 2, 1, -1>(qq2);
811 f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
812 diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
814 t0 = F128::Swizzle<1, 0, 0, -1>(quat);
815 t1 = F128::Swizzle<2, 2, 1, -1>(q2);
816 f128 yz_xz_xy = F128::Mult(t0, t1);
819 f128 wx_wy_wz = F128::Mult(q2, t0);
821 f128 plus = F128::Add(yz_xz_xy, wx_wy_wz);
822 f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz);
824 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
825 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
828 m.
r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
829 m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
830 m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
831 m.r[3] = F128::Set0001();
843 f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
844 f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
845 f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
847 tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
848 tmp = F128::NegateEx<false, false, true, true>(tmp);
849 m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
853 f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
854 m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz,
each_select32);
857 f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
858 f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
861 m.
r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
862 m.r[1] = F128::SetZeroToLane<3>(r1);
863 m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
864 m.r[3] = F128::Set0001();
871 SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
872 SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
875 neg = F128::SetZeroToLane<3>(neg);
876 f128 d012 = Vector4::Dot3(neg, r0, r1, eye_dir_normalized);
880 m.
r[2] = eye_dir_normalized;
881 m.
r[3] = F128::Set0001();
883 m.
r[3] = F128::SetFloatToLane<3>(d012, 1.f);
886 f128 d0 = Vector3::Dot(r0, neg);
887 f128 d1 = Vector3::Dot(r1, neg);
888 f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
890 m.
r[0] = F128::Splat<false, false, false, true>(r0, d0);
891 m.
r[1] = F128::Splat<false, false, false, true>(r1, d1);
892 m.
r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
893 m.
r[3] = F128::Set0001();
901 SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
902 eye_dir = Vector3::Normalize(eye_dir);
903 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
909 return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
915 SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
916 eye_dir = Vector3::Normalize(eye_dir);
917 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
921 NLIB_M(
SimdMatrix) Matrix::PerspectiveLh(
float width,
float height,
float near_z,
923 float near2 = near_z + near_z;
924 float range = far_z / (far_z - near_z);
925 f128 zero = F128::SetZero();
926 f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
928 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
929 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
930 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
931 m.
r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
932 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
937 NLIB_M(
SimdMatrix) Matrix::PerspectiveRh(
float width,
float height,
float near_z,
939 float near2 = near_z + near_z;
940 float range = far_z / (near_z - far_z);
941 f128 zero = F128::SetZero();
942 f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
944 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
945 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
946 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
947 m.
r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
948 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
953 NLIB_M(
SimdMatrix) Matrix::PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
955 float height = half_fovy_cos / half_fovy_sin;
956 float width = height / aspect;
957 float range = far_z / (far_z - near_z);
959 f128 zero = F128::SetZero();
960 f128 v = F128::SetValue(width, height, range, -range * near_z);
962 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
963 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
964 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
965 m.
r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
966 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
971 NLIB_M(
SimdMatrix) Matrix::PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
973 float height = half_fovy_cos / half_fovy_sin;
974 float width = height / aspect;
975 float range = far_z / (near_z - far_z);
977 f128 zero = F128::SetZero();
978 f128 v = F128::SetValue(width, height, range, range * near_z);
980 m.
r[0] = F128::Splat<false, true, true, true>(v, zero);
981 m.
r[1] = F128::Splat<true, false, true, true>(v, zero);
982 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
983 m.
r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
984 m.
r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
989 NLIB_M(
SimdMatrix) Matrix::PerspectiveOffCenterLh(
float left,
float right,
float bottom,
float top,
991 float near2 = near_z + near_z;
994 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
995 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
996 div = F128::Div(a, b);
999 f128 zero = F128::SetZero();
1000 f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
1001 f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
1002 v0 = F128::Mult(v0, div);
1005 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1006 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1007 m.
r[2] = F128::Mult(r2, div);
1008 m.
r[3] = F128::Splat<true, true, false, true>(v0, zero);
1013 NLIB_M(
SimdMatrix) Matrix::PerspectiveOffCenterRh(
float left,
float right,
float bottom,
float top,
1015 float near2 = near_z + near_z;
1018 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1019 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
1020 div = F128::Div(a, b);
1023 f128 zero = F128::SetZero();
1024 f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
1025 f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
1026 v0 = F128::Mult(v0, div);
1029 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1030 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1031 m.
r[2] = F128::Mult(r2, div);
1032 m.
r[3] = F128::Splat<true, true, false, true>(v0, zero);
1037 NLIB_M(
SimdMatrix) Matrix::OrthographicLh(
float width,
float height,
float near_z,
1041 f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
1042 f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
1043 div = F128::Div(a, b);
1045 f128 zero = F128::SetZero();
1048 m.
r[0] = F128::Splat<false, true, true, true>(div, zero);
1049 m.
r[1] = F128::Splat<true, false, true, true>(div, zero);
1050 m.
r[2] = F128::Splat<true, true, false, true>(div, zero);
1051 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1052 m.
r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1057 NLIB_M(
SimdMatrix) Matrix::OrthographicRh(
float width,
float height,
float near_z,
1061 f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
1062 f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
1063 div = F128::Div(a, b);
1065 f128 zero = F128::SetZero();
1068 m.
r[0] = F128::Splat<false, true, true, true>(div, zero);
1069 m.
r[1] = F128::Splat<true, false, true, true>(div, zero);
1070 m.
r[2] = F128::Splat<true, true, false, true>(div, zero);
1071 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1072 m.
r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1077 NLIB_M(
SimdMatrix) Matrix::OrthographicOffCenterLh(
float left,
float right,
float bottom,
float top,
1081 f128 a = F128::SetOne();
1082 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1083 div = F128::Div(a, b);
1085 f128 zero = F128::SetZero();
1086 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1087 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
1088 v0 = F128::Mult(v0, div);
1091 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1092 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1093 m.
r[2] = F128::Splat<true, true, false, true>(v0, zero);
1094 m.
r[3] = F128::Mult(r3, div);
1099 NLIB_M(
SimdMatrix) Matrix::OrthographicOffCenterRh(
float left,
float right,
float bottom,
float top,
1103 f128 a = F128::SetOne();
1104 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
1105 div = F128::Div(a, b);
1107 f128 zero = F128::SetZero();
1108 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1109 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
1110 v0 = F128::Mult(v0, div);
1113 m.
r[0] = F128::Splat<false, true, true, true>(v0, zero);
1114 m.
r[1] = F128::Splat<true, false, true, true>(v0, zero);
1115 m.
r[2] = F128::Splat<true, true, false, true>(v0, zero);
1116 m.
r[3] = F128::Mult(r3, div);
1123 SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
1125 f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
1126 plane = F128::Negate(plane);
1127 f128 r1 = F128::RotateLeft<1>(r0);
1128 f128 r2 = F128::RotateLeft<2>(r0);
1129 f128 r3 = F128::RotateLeft<3>(r0);
1133 m.r[1] = F128::MultAdd<1>(plane, light_pos, r1,
each_select32);
1134 m.r[2] = F128::MultAdd<2>(plane, light_pos, r2,
each_select32);
1135 m.r[3] = F128::MultAdd<3>(plane, light_pos, r3,
each_select32);
1142 SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1143 f128 minus_2n = F128::Mult(-2.f, plane);
1144 minus_2n = F128::SetZeroToLane<3>(minus_2n);
1147 m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0],
each_select32);
1148 m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1],
each_select32);
1149 m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2],
each_select32);
1150 m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3],
each_select32);
1164 f128 dot_x = Vector3::DotEx<true, false, false, true>(m.
r[0], m.
r[0]);
1165 f128 dot_y = Vector3::DotEx<false, true, false, true>(m.
r[1], m.
r[1]);
1166 f128 dot_z = Vector3::DotEx<false, false, true, true>(m.
r[2], m.
r[2]);
1167 f128 dot = F128::Or(dot_x, dot_y);
1168 dot = F128::Or(dot, dot_z);
1169 recp_scale = F128::RecpSqrt(dot);
1170 *scale = F128::Mult(dot, recp_scale);
1177 rot->
r[3] = F128::Set0001();
1182 #endif // NLIB_DOXYGEN 1187 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ The class with the collection of functions that handle 4x4 matrices.
float m[3][4]
A 2D 3x4 array.
f128arg SimdVectorArg
f128arg is defined using typedef.
float m[4][3]
A 2D 4x3 array.
float m[4][4]
A 2D 4x4 array.
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
A macro for in-place matrix transposition.
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constant object.
f128arg SimdQuaternionArg
f128arg is defined using typedef.
f128arg SimdPlaneArg
f128arg is defined using typedef.
f128 r[4]
Keeps each row of a 4x4 matrix.
The structure for keeping a 4x4 matrix.
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Defines a three-dimensional vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Defines a four-dimensional vector.
float m[3][3]
A 2D 3x3 array.
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...