16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 17 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_ 50 static SimdMatrix __vectorcall FromScaling(
float scale_x,
float scale_y,
53 static SimdMatrix __vectorcall FromTranslation(
float ofs_x,
float ofs_y,
76 static SimdMatrix __vectorcall PerspectiveLh(
float width,
float height,
float near_z,
78 static SimdMatrix __vectorcall PerspectiveRh(
float width,
float height,
float near_z,
80 static SimdMatrix __vectorcall PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
81 float aspect,
float near_z,
83 static SimdMatrix __vectorcall PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
84 float aspect,
float near_z,
86 static SimdMatrix __vectorcall PerspectiveOffCenterLh(
float left,
float right,
float bottom,
87 float top,
float near_z,
89 static SimdMatrix __vectorcall PerspectiveOffCenterRh(
float left,
float right,
float bottom,
90 float top,
float near_z,
93 static SimdMatrix __vectorcall OrthographicLh(
float width,
float height,
float near_z,
95 static SimdMatrix __vectorcall OrthographicRh(
float width,
float height,
float near_z,
97 static SimdMatrix __vectorcall OrthographicOffCenterLh(
float left,
float right,
float bottom,
98 float top,
float near_z,
100 static SimdMatrix __vectorcall OrthographicOffCenterRh(
float left,
float right,
float bottom,
101 float top,
float near_z,
117 #define NLIB_M(tp) inline tp __vectorcall 121 m.
r[0] = F128::LoadA16(&p->m[0][0]);
122 m.
r[1] = F128::LoadA16(&p->m[1][0]);
123 m.
r[2] = F128::LoadA16(&p->m[2][0]);
124 m.
r[3] = F128::LoadA16(&p->m[3][0]);
// Matrix::LoadFloat3x4 (excerpt): fills a SimdMatrix from the Float3x4 pointed to by p.
// Rows 0..2 are read with F128::LoadA16 from the start of p->m[0], p->m[1] and p->m[2];
// row 3 is not read from memory but produced by F128::Set0001().
// NOTE(review): the declaration of `m` and the return statement are not visible in this
// excerpt. StoreFloat3x4 transposes before storing, so confirm whether the hidden tail of
// this function transposes as well.
129 NLIB_M(SimdMatrix) Matrix::LoadFloat3x4(
const Float3x4* p)
NLIB_NOEXCEPT {
131 m.r[0] = F128::LoadA16(&p->m[0][0]);
132 m.r[1] = F128::LoadA16(&p->m[1][0]);
133 m.r[2] = F128::LoadA16(&p->m[2][0]);
// Last row is a constant, not loaded (presumably (0, 0, 0, 1) - verify F128::Set0001).
134 m.r[3] = F128::Set0001();
// Matrix::LoadFloat4x3 (excerpt): expands the packed Float4x3 at p into four 4-lane rows.
// The three loads start at &m[0][0], &m[1][1] and &m[2][2]; with m being a 4x3 float array
// these are flat element offsets 0, 4 and 8, i.e. the 12 floats are read as three
// consecutive 4-float chunks.
// NOTE(review): Permute indices 0..3 appear to select lanes of the first argument and 4..7
// lanes of the second (-1 = unspecified lane); confirm against F128::Permute.
139 NLIB_M(SimdMatrix) Matrix::LoadFloat4x3(
const Float4x3* p)
NLIB_NOEXCEPT {
140 f128 t0 = F128::LoadA16(&p->m[0][0]);
141 f128 t1 = F128::LoadA16(&p->m[1][1]);
142 f128 t2 = F128::LoadA16(&p->m[2][2]);
// Row 0: elements 0..2 of the first chunk, lane 3 cleared.
144 m.r[0] = F128::SetZeroToLane<3>(t0);
// Row 1: last element of chunk 0 followed by the first two of chunk 1, lane 3 cleared.
145 f128 tmp1 = F128::Permute<3, 4, 5, -1>(t0, t1);
146 m.r[1] = F128::SetZeroToLane<3>(tmp1);
// Row 2: last two elements of chunk 1 followed by the first of chunk 2, lane 3 cleared.
147 f128 tmp2 = F128::Permute<2, 3, 4, -1>(t1, t2);
148 m.r[2] = F128::SetZeroToLane<3>(tmp2);
// Row 3: remaining three elements of chunk 2, lane 3 taken from F128::SetOne().
149 m.r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
// Matrix::LoadFloat3x3 (excerpt): expands the Float3x3 at p into a SimdMatrix.
// Three overlapping 4-float loads (F128::LoadA4) start at &m[0][0], &m[1][0] and &m[1][2];
// with m being a 3x3 float array these are flat offsets 0, 3 and 5, so the last load
// covers elements 5..8 and ends exactly at the end of the 9-float array.
153 NLIB_M(SimdMatrix) Matrix::LoadFloat3x3(
const Float3x3* p)
NLIB_NOEXCEPT {
154 f128 t0 = F128::LoadA4(&p->m[0][0]);
155 f128 t1 = F128::LoadA4(&p->m[1][0]);
156 f128 t2 = F128::LoadA4(&p->m[1][2]);
157 f128 zero = F128::SetZero();
// Rows 0 and 1: the fourth loaded float belongs to the next row, so lane 3 is cleared.
159 m.r[0] = F128::SetZeroToLane<3>(t0);
160 m.r[1] = F128::SetZeroToLane<3>(t1);
// Row 2: lanes 1..3 of t2 (flat elements 6..8) plus one lane taken from `zero`.
161 m.r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
162 m.r[3] = F128::Set0001();
// Matrix::StoreFloat4x4 (excerpt): writes the four rows m.r[0..3] to p->m[0..3] with
// F128::StoreA16, one row per store, without reordering.
// NOTE(review): StoreA16 presumably requires a 16-byte-aligned destination - confirm.
166 inline void __vectorcall Matrix::StoreFloat4x4(Float4x4* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
167 F128::StoreA16(&p->m[0][0], m.r[0]);
168 F128::StoreA16(&p->m[1][0], m.r[1]);
169 F128::StoreA16(&p->m[2][0], m.r[2]);
170 F128::StoreA16(&p->m[3][0], m.r[3]);
// Matrix::StoreFloat4x3 (excerpt): packs lanes 0..2 of each of the four rows into twelve
// consecutive floats and writes them with three 4-float stores. Lane 3 of every row is
// dropped. The store addresses &m[0][0], &m[1][1], &m[2][2] are flat offsets 0, 4 and 8
// of the 4x3 float array.
// NOTE(review): Permute indices 0..3 appear to address the first argument and 4..7 the
// second; confirm against F128::Permute.
173 inline void __vectorcall Matrix::StoreFloat4x3(Float4x3* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
// t0 = r0[0], r0[1], r0[2], r1[0]
174 f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
// t1 = r1[1], r1[2], r2[0], r2[1]
175 f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
// t2 = r2[2], r3[0], r3[1], r3[2]
176 f128 t2 = F128::Permute<2, 4, 5, 6>(m.r[2], m.r[3]);
177 F128::StoreA16(&p->m[0][0], t0);
178 F128::StoreA16(&p->m[1][1], t1);
179 F128::StoreA16(&p->m[2][2], t2);
// Matrix::StoreFloat3x3 (excerpt): writes lanes 0..2 of rows 0..2 into the 3x3 float array
// at p. Two 4-float stores (F128::StoreA4) at flat offsets 0 and 4 cover elements 0..7;
// the ninth element m[2][2] is written as a scalar so nothing is stored past the array.
// Row 3 and lane 3 of every row are not stored.
182 inline void __vectorcall Matrix::StoreFloat3x3(Float3x3* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
// t0 = r0[0], r0[1], r0[2], r1[0]
183 f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
// t1 = r1[1], r1[2], r2[0], r2[1]
184 f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
185 F128::StoreA4(&p->m[0][0], t0);
186 F128::StoreA4(&p->m[1][1], t1);
187 p->m[2][2] = F128::GetFloatFromLane<2>(m.r[2]);
205 f128 c0det, c1det, c2det;
207 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.r[2]);
208 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.r[3]);
210 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.r[2]);
211 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.r[3]);
213 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.r[2]);
214 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.r[3]);
216 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
217 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
218 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
220 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
221 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
222 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
225 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.r[1]);
226 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.r[1]);
227 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.r[1]);
228 f128 r0x = F128::NegateEx<true, false, true, false>(m.r[0]);
230 f128 det3_neg = F128::Mult(c1det, ccbb_1);
231 det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
232 det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
233 return Vector4::Dot(r0x, det3_neg);
243 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE) 244 float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
245 float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
246 float32x2_t x00 = vcreate_f32(0ULL);
247 m.r[0] = vcombine_f32(x10, x00);
248 m.r[1] = vcombine_f32(x01, x00);
249 m.r[2] = vcombine_f32(x00, x10);
250 m.r[3] = vcombine_f32(x00, x01);
252 m.r[0] = F128::LoadA16(F128::v1000_);
253 m.r[1] = F128::LoadA16(F128::v0100_);
254 m.r[2] = F128::LoadA16(F128::v0010_);
255 m.r[3] = F128::LoadA16(F128::v0001_);
262 NLIB_M(SimdMatrix) Matrix::Transpose(SimdMatrixArg m)
NLIB_NOEXCEPT {
// Matrix::StoreFloat3x4 (excerpt): transposes m and stores rows 0..2 of the transposed
// matrix to p->m[0..2] with F128::StoreA16. Row 3 of the transposed matrix (i.e. lane 3 of
// every row of m) is not written.
276 inline void __vectorcall Matrix::StoreFloat3x4(Float3x4* p, SimdMatrixArg m)
NLIB_NOEXCEPT {
277 SimdMatrix M = Matrix::Transpose(m);
278 F128::StoreA16(&p->m[0][0], M.r[0]);
279 F128::StoreA16(&p->m[1][0], M.r[1]);
280 F128::StoreA16(&p->m[2][0], M.r[2]);
311 f128 detvalue_reciprocal;
315 f128 c0det, c1det, c2det;
317 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
318 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
320 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
321 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
323 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
324 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
326 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
327 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
328 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
330 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
331 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
332 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
335 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
336 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
337 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
339 f128 r0x_m = F128::Permute<0, -1, 4, -1>(m.r[0], m.r[2]);
340 f128 r0x_p = F128::Permute<-1, 0, -1, 4>(m.r[1], m.r[3]);
341 f128 r0x = F128::Permute<0, 5, 2, 7>(F128::Negate(r0x_m), r0x_p);
343 f128 det3 = F128::Mult(c1det, ccbb_1);
344 det3 = F128::MultSub(c0det, baaa_1, det3);
345 det3 = F128::MultSub(c2det, dddc_1, det3);
347 mydet = Vector4::Dot(r0x, det3);
349 det3 = F128::NegateEx<true, false, true, false>(det3);
350 detvalue_reciprocal = F128::Recp(mydet);
352 ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
355 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
356 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
357 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
359 f128 det3 = F128::Mult(c0det, baaa_0);
360 det3 = F128::MultAdd(c2det, dddc_0, det3);
361 det3 = F128::MultSub(c1det, ccbb_0, det3);
362 det3 = F128::NegateEx<true, false, true, false>(det3);
363 ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
379 f128 c0det, c1det, c2det;
381 f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
382 f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
384 f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
385 f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
387 f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
388 f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
390 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
391 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
392 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
394 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
395 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
396 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
399 f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
400 f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
401 f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
403 f128 det3 = F128::Mult(c1det, ccbb_3);
404 det3 = F128::MultSub(c0det, baaa_3, det3);
405 det3 = F128::MultSub(c2det, dddc_3, det3);
406 det3 = F128::NegateEx<true, false, true, false>(det3);
408 ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
411 f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
412 f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
413 f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
415 f128 det3 = F128::Mult(c0det, baaa_2);
416 det3 = F128::MultAdd(c2det, dddc_2, det3);
417 det3 = F128::MultSub(c1det, ccbb_2, det3);
418 det3 = F128::NegateEx<true, false, true, false>(det3);
420 ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
431 SimdMatrix M = Transpose(m);
455 f128 detvalue_reciprocal;
458 f128 c0det, c1det, c2det;
460 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
461 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
463 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
464 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
466 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
467 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
469 f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
470 f128 tmp1 = F128::Mult(baaa_2, dddc_3);
471 f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
473 c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
474 c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
475 c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
478 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
479 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
480 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
481 f128 r0x = F128::NegateEx<true, false, true, false>(M.r[0]);
483 f128 det3 = F128::Mult(c1det, ccbb_1);
484 det3 = F128::MultSub(c0det, baaa_1, det3);
485 det3 = F128::MultSub(c2det, dddc_1, det3);
487 detvalue_reciprocal = Vector4::Dot(r0x, det3);
489 *det = detvalue_reciprocal;
492 det3 = F128::NegateEx<true, false, true, false>(det3);
493 detvalue_reciprocal = F128::Recp(detvalue_reciprocal);
495 ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
498 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
499 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
500 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
502 f128 det3 = F128::Mult(c0det, baaa_0);
503 det3 = F128::MultAdd(c2det, dddc_0, det3);
504 det3 = F128::MultSub(c1det, ccbb_0, det3);
505 det3 = F128::NegateEx<true, false, true, false>(det3);
506 ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
522 f128 c0det, c1det, c2det;
524 f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
525 f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
527 f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
528 f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
530 f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
531 f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
533 f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
534 f128 tmp1 = F128::Mult(baaa_0, dddc_1);
535 f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
537 c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
538 c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
539 c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
542 f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
543 f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
544 f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
546 f128 det3 = F128::Mult(c1det, ccbb_3);
547 det3 = F128::MultSub(c0det, baaa_3, det3);
548 det3 = F128::MultSub(c2det, dddc_3, det3);
549 det3 = F128::NegateEx<true, false, true, false>(det3);
551 ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
554 f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
555 f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
556 f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
558 f128 det3 = F128::Mult(c0det, baaa_2);
559 det3 = F128::MultAdd(c2det, dddc_2, det3);
560 det3 = F128::MultSub(c1det, ccbb_2, det3);
561 det3 = F128::NegateEx<true, false, true, false>(det3);
563 ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
// Matrix::IsIdentity (excerpt): compares every row of m lane-wise (F128::CmpEq) with the
// corresponding row of Matrix::Identity(), ANDs the four comparison masks together and
// returns F128::IsAllMaskTrue of the result. The comparison is exact equality; no
// tolerance is applied.
572 NLIB_M(
bool) Matrix::IsIdentity(SimdMatrixArg m)
NLIB_NOEXCEPT {
573 SimdMatrix x = Matrix::Identity();
574 f128 cmp0 = F128::CmpEq(x.r[0], m.r[0]);
575 f128 cmp1 = F128::CmpEq(x.r[1], m.r[1]);
576 f128 cmp2 = F128::CmpEq(x.r[2], m.r[2]);
577 f128 cmp3 = F128::CmpEq(x.r[3], m.r[3]);
// Reduce the four row masks to one: a lane stays set only if it matched in all rows.
578 cmp0 = F128::And(cmp0, cmp1);
579 cmp2 = F128::And(cmp2, cmp3);
580 cmp0 = F128::And(cmp0, cmp2);
581 return F128::IsAllMaskTrue(cmp0);
// Matrix::IsInfinite (excerpt): returns true when at least one lane of any row is flagged
// as infinite. Two variants are visible; the first is guarded by
// #ifdef NLIB_F128_SIMD_NOUSE and calls F128::IsInfinite per row, the second compares
// F128::Abs(row) with F128::SetInfinity() using F128::CmpEq. In both, the four row masks
// are ORed together and the result is !F128::IsAllMaskFalse(mask).
// NOTE(review): the #else / #endif lines separating the two variants are not visible in
// this excerpt.
586 NLIB_M(
bool) Matrix::IsInfinite(SimdMatrixArg m)
NLIB_NOEXCEPT {
587 #ifdef NLIB_F128_SIMD_NOUSE 588 f128 cmp0 = F128::IsInfinite(m.r[0]);
589 f128 cmp1 = F128::IsInfinite(m.r[1]);
590 f128 cmp2 = F128::IsInfinite(m.r[2]);
591 f128 cmp3 = F128::IsInfinite(m.r[3]);
592 cmp0 = F128::Or(cmp0, cmp1);
593 cmp2 = F128::Or(cmp2, cmp3);
594 cmp0 = F128::Or(cmp0, cmp2);
595 return !F128::IsAllMaskFalse(cmp0);
// Second variant: taking the absolute value first lets one equality test against the
// infinity constant catch both signs.
597 f128 inf_value = F128::SetInfinity();
598 f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.r[0]));
599 f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.r[1]));
600 f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.r[2]));
601 f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.r[3]));
602 cmp0 = F128::Or(cmp0, cmp1);
603 cmp2 = F128::Or(cmp2, cmp3);
604 cmp0 = F128::Or(cmp0, cmp2);
605 return !F128::IsAllMaskFalse(cmp0);
612 f128 cmp0 = F128::IsNaN(m.r[0]);
613 f128 cmp1 = F128::IsNaN(m.r[1]);
614 f128 cmp2 = F128::IsNaN(m.r[2]);
615 f128 cmp3 = F128::IsNaN(m.r[3]);
616 cmp0 = F128::Or(cmp0, cmp1);
617 cmp2 = F128::Or(cmp2, cmp3);
618 cmp0 = F128::Or(cmp0, cmp2);
619 return !F128::IsAllMaskFalse(cmp0);
// Matrix::Mult (excerpt): builds the result one row at a time; row i of the result is
// Vector4::Transform(a.r[i], b), i.e. each row of a is transformed by the whole matrix b.
// NOTE(review): the declaration of `m` and the return statement are not visible here.
624 NLIB_M(SimdMatrix) Matrix::Mult(SimdMatrixArg a, SimdMatrixArg b)
NLIB_NOEXCEPT {
626 m.r[0] = Vector4::Transform(a.r[0], b);
627 m.r[1] = Vector4::Transform(a.r[1], b);
628 m.r[2] = Vector4::Transform(a.r[2], b);
629 m.r[3] = Vector4::Transform(a.r[3], b);
// Matrix::MultTranspose (excerpt): computes the same four row products as Matrix::Mult
// (Vector4::Transform of each row of a by b) into the locals r0..r3.
// NOTE(review): the rest of the function is not visible in this excerpt; judging by the
// name it presumably transposes r0..r3 before returning - confirm in the full source.
635 NLIB_M(SimdMatrix) Matrix::MultTranspose(SimdMatrixArg a, SimdMatrixArg b)
NLIB_NOEXCEPT {
636 f128 r0 = Vector4::Transform(a.r[0], b);
637 f128 r1 = Vector4::Transform(a.r[1], b);
638 f128 r2 = Vector4::Transform(a.r[2], b);
639 f128 r3 = Vector4::Transform(a.r[3], b);
// Matrix::FromScaling, scalar overload (excerpt): builds a scaling matrix from three
// per-axis factors. Starting from an all-zero vector, scale_x is written to lane 0 of
// row 0, scale_y to lane 1 of row 1 and scale_z to lane 2 of row 2; row 3 comes from
// F128::Set0001().
// NOTE(review): the declaration of `m` and the return statement are not visible here.
654 NLIB_M(SimdMatrix) Matrix::FromScaling(
float scale_x,
float scale_y,
float scale_z)
NLIB_NOEXCEPT {
656 f128 zero = F128::SetZero();
657 m.r[0] = F128::SetFloatToLane<0>(zero, scale_x);
658 m.r[1] = F128::SetFloatToLane<1>(zero, scale_y);
659 m.r[2] = F128::SetFloatToLane<2>(zero, scale_z);
660 m.r[3] = F128::Set0001();
671 f128 zero = F128::SetZero();
672 m.r[0] = F128::Splat<false, true, true, true>(scale, zero);
673 m.r[1] = F128::Splat<true, false, true, true>(scale, zero);
674 m.r[2] = F128::Splat<true, true, false, true>(scale, zero);
675 m.r[3] = F128::Set0001();
// Matrix::FromTranslation, scalar overload (excerpt): rows 0..2 are the constants produced
// by F128::Set1000(), Set0100() and Set0010(); the offsets are placed in row 3 as
// (ofs_x, ofs_y, ofs_z, 1.f). The translation therefore lives in the last row.
// NOTE(review): the declaration of `m` and the return statement are not visible here.
684 NLIB_M(SimdMatrix) Matrix::FromTranslation(
float ofs_x,
float ofs_y,
float ofs_z)
NLIB_NOEXCEPT {
686 m.r[0] = F128::Set1000();
687 m.r[1] = F128::Set0100();
688 m.r[2] = F128::Set0010();
689 m.r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
700 m.r[0] = F128::Set1000();
701 m.r[1] = F128::Set0100();
702 m.r[2] = F128::Set0010();
703 m.r[3] = F128::Permute<0, 1, 2, 4>(ofs, m.r[0]);
// Matrix::FromRotationX (excerpt): builds a rotation about the X axis from a precomputed
// sine/cosine pair (the caller supplies sin_value and cos_value; no trigonometry is done
// here). Visible work: r1 gets cos in lane 1 and sin in lane 2, r2 gets -sin in lane 1 and
// cos in lane 2; row 0 is F128::Set1000() and row 3 is F128::Set0001().
// NOTE(review): the declaration of `zero` and the lines that assign r1/r2 into m.r[1] and
// m.r[2] are not visible in this excerpt; `zero` is presumably F128::SetZero().
712 NLIB_M(SimdMatrix) Matrix::FromRotationX(
float sin_value,
float cos_value)
NLIB_NOEXCEPT {
715 f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
716 r1 = F128::SetFloatToLane<2>(r1, sin_value);
717 f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
718 r2 = F128::SetFloatToLane<2>(r2, cos_value);
720 m.r[0] = F128::Set1000();
723 m.r[3] = F128::Set0001();
// Matrix::FromRotationY (excerpt): builds a rotation about the Y axis from a precomputed
// sine/cosine pair. Visible work: r0 gets cos in lane 0 and -sin in lane 2, r2 gets sin in
// lane 0 and cos in lane 2; row 1 is F128::Set0100() and row 3 is F128::Set0001().
// NOTE(review): the declaration of `zero` and the lines that assign r0/r2 into m.r[0] and
// m.r[2] are not visible in this excerpt; `zero` is presumably F128::SetZero().
732 NLIB_M(SimdMatrix) Matrix::FromRotationY(
float sin_value,
float cos_value)
NLIB_NOEXCEPT {
735 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
736 r0 = F128::SetFloatToLane<2>(r0, -sin_value);
737 f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
738 r2 = F128::SetFloatToLane<2>(r2, cos_value);
741 m.r[1] = F128::Set0100();
743 m.r[3] = F128::Set0001();
// Matrix::FromRotationZ (excerpt): builds a rotation about the Z axis from a precomputed
// sine/cosine pair. Visible work: r0 gets cos in lane 0 and sin in lane 1, r1 gets -sin in
// lane 0 and cos in lane 1; row 2 is F128::Set0010() and row 3 is F128::Set0001().
// NOTE(review): the declaration of `zero` and the lines that assign r0/r1 into m.r[0] and
// m.r[1] are not visible in this excerpt; `zero` is presumably F128::SetZero().
752 NLIB_M(SimdMatrix) Matrix::FromRotationZ(
float sin_value,
float cos_value)
NLIB_NOEXCEPT {
755 f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
756 r0 = F128::SetFloatToLane<1>(r0, sin_value);
757 f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
758 r1 = F128::SetFloatToLane<1>(r1, cos_value);
762 m.r[2] = F128::Set0010();
763 m.r[3] = F128::Set0001();
771 Matrix::FromRotationAxisAndSinCos(
SimdVectorArg axis_normalized,
float sin_value,
776 f128 nn = F128::Mult(axis_normalized, axis_normalized);
778 c1 = F128::SetValue(1.f - cos_value,
each_float);
779 diagonal = F128::MultAdd(c1,
nn, c);
780 diagonal = F128::SetZeroToLane<3>(diagonal);
783 f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
785 f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
786 xy_yz_xz = F128::Mult(c1, xy_yz_xz);
787 f128 plus = F128::MultAdd(s, zxy, xy_yz_xz);
788 f128 minus = F128::MultSub(s, zxy, xy_yz_xz);
790 f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
791 f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
794 m.r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
795 m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
796 m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
797 m.r[3] = F128::Set0001();
805 f128 q2 = F128::Add(quat, quat);
806 f128 qq2 = F128::Mult(quat, q2);
809 t0 = F128::Swizzle<1, 0, 0, -1>(qq2);
810 t1 = F128::Swizzle<2, 2, 1, -1>(qq2);
812 f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
813 diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
815 t0 = F128::Swizzle<1, 0, 0, -1>(quat);
816 t1 = F128::Swizzle<2, 2, 1, -1>(q2);
817 f128 yz_xz_xy = F128::Mult(t0, t1);
820 f128 wx_wy_wz = F128::Mult(q2, t0);
822 f128 plus = F128::Add(yz_xz_xy, wx_wy_wz);
823 f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz);
825 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
826 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
829 m.r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
830 m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
831 m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
832 m.r[3] = F128::Set0001();
844 f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
845 f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
846 f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
848 tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
849 tmp = F128::NegateEx<false, false, true, true>(tmp);
850 m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
854 f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
855 m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz,
each_select32);
858 f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
859 f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
862 m.r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
863 m.r[1] = F128::SetZeroToLane<3>(r1);
864 m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
865 m.r[3] = F128::Set0001();
873 SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
874 SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
877 neg = F128::SetZeroToLane<3>(neg);
878 f128 d012 = Vector4::Dot3(neg, r0, r1, eye_dir_normalized);
882 m.r[2] = eye_dir_normalized;
883 m.r[3] = F128::Set0001();
885 m.r[3] = F128::SetFloatToLane<3>(d012, 1.f);
888 f128 d0 = Vector3::Dot(r0, neg);
889 f128 d1 = Vector3::Dot(r1, neg);
890 f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
892 m.r[0] = F128::Splat<false, false, false, true>(r0, d0);
893 m.r[1] = F128::Splat<false, false, false, true>(r1, d1);
894 m.r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
895 m.r[3] = F128::Set0001();
904 SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
905 eye_dir = Vector3::Normalize(eye_dir);
906 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
913 return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
920 SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
921 eye_dir = Vector3::Normalize(eye_dir);
922 return LookToLh(eye_pos, eye_dir, up_dir_normalized);
// Matrix::PerspectiveLh (excerpt): builds a perspective projection from the view-volume
// width/height and the near/far distances.
// Intermediate vector v = (2*near_z/width, 2*near_z/height, range, -range*near_z) with
// range = far_z / (far_z - near_z).
// No validation of the arguments is visible: width, height or (far_z - near_z) equal to
// zero are divided by as-is.
// NOTE(review): the return-type line, the declaration of `m` and the return statement are
// not visible in this excerpt. Splat<...>(v, zero) presumably keeps the lanes flagged
// false from v and takes the lanes flagged true from zero - confirm against F128::Splat.
927 Matrix::PerspectiveLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT {
928 float near2 = near_z + near_z;
929 float range = far_z / (far_z - near_z);
930 f128 zero = F128::SetZero();
931 f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
// Rows 0 and 1 keep only lane 0 / lane 1 of v.
933 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
934 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
// Row 2 keeps lane 2 of v (range) and has lane 3 overwritten with 1.
935 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
936 m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
// Row 3: lane 2 receives lane 3 of v (-range*near_z); the other lanes come from zero.
937 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
// Matrix::PerspectiveRh (excerpt): counterpart of PerspectiveLh with the depth terms
// sign-flipped.
// Intermediate vector v = (2*near_z/width, 2*near_z/height, range, range*near_z) with
// range = far_z / (near_z - far_z); lane 3 of row 2 is set to -1 instead of 1.
// No validation of the arguments is visible: zero width, height or (near_z - far_z) are
// divided by as-is.
// NOTE(review): the return-type line, the declaration of `m` and the return statement are
// not visible in this excerpt.
943 Matrix::PerspectiveRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT {
944 float near2 = near_z + near_z;
945 float range = far_z / (near_z - far_z);
946 f128 zero = F128::SetZero();
947 f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
949 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
950 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
951 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
952 m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
// Row 3: lane 2 receives lane 3 of v (range*near_z); the other lanes come from zero.
953 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
959 Matrix::PerspectiveFovLh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
float near_z,
961 float height = half_fovy_cos / half_fovy_sin;
962 float width = height / aspect;
963 float range = far_z / (far_z - near_z);
965 f128 zero = F128::SetZero();
966 f128 v = F128::SetValue(width, height, range, -range * near_z);
968 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
969 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
970 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
971 m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
972 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
978 Matrix::PerspectiveFovRh(
float half_fovy_sin,
float half_fovy_cos,
float aspect,
float near_z,
980 float height = half_fovy_cos / half_fovy_sin;
981 float width = height / aspect;
982 float range = far_z / (near_z - far_z);
984 f128 zero = F128::SetZero();
985 f128 v = F128::SetValue(width, height, range, range * near_z);
987 m.r[0] = F128::Splat<false, true, true, true>(v, zero);
988 m.r[1] = F128::Splat<true, false, true, true>(v, zero);
989 f128 tmp = F128::Splat<true, true, false, true>(v, zero);
990 m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
991 m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
997 Matrix::PerspectiveOffCenterLh(
float left,
float right,
float bottom,
float top,
float near_z,
999 float near2 = near_z + near_z;
1002 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1003 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1004 div = F128::Div(a, b);
1007 f128 zero = F128::SetZero();
1008 f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
1009 f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
1010 v0 = F128::Mult(v0, div);
1013 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1014 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1015 m.r[2] = F128::Mult(r2, div);
1016 m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
1022 Matrix::PerspectiveOffCenterRh(
float left,
float right,
float bottom,
float top,
float near_z,
1024 float near2 = near_z + near_z;
1027 f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1028 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
1029 div = F128::Div(a, b);
1032 f128 zero = F128::SetZero();
1033 f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
1034 f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
1035 v0 = F128::Mult(v0, div);
1038 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1039 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1040 m.r[2] = F128::Mult(r2, div);
1041 m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
// Matrix::OrthographicLh (excerpt): builds an orthographic projection from the view-volume
// width/height and the near/far distances.
// A single vector division produces all coefficients:
//   div = (2/width, 2/height, 1/(far_z - near_z), -near_z/(far_z - near_z)).
// No validation of the arguments is visible: zero width, height or (far_z - near_z) are
// divided by as-is.
// NOTE(review): the return-type line, the declarations of `div` and `m`, and the return
// statement are not visible in this excerpt.
1047 Matrix::OrthographicLh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT {
1050 f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
1051 f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
1052 div = F128::Div(a, b);
1054 f128 zero = F128::SetZero();
// Rows 0..2 each keep one lane of div (lane 0, 1, 2 respectively).
1057 m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1058 m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1059 m.r[2] = F128::Splat<true, true, false, true>(div, zero);
// Row 3: lane 2 receives lane 3 of div, then lane 3 is overwritten with 1.
1060 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1061 m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
// Matrix::OrthographicRh (excerpt): counterpart of OrthographicLh with the depth
// denominator reversed:
//   div = (2/width, 2/height, 1/(near_z - far_z), near_z/(near_z - far_z)).
// The row assembly is identical to OrthographicLh.
// No validation of the arguments is visible: zero width, height or (near_z - far_z) are
// divided by as-is.
// NOTE(review): the return-type line, the declarations of `div` and `m`, and the return
// statement are not visible in this excerpt.
1067 Matrix::OrthographicRh(
float width,
float height,
float near_z,
float far_z)
NLIB_NOEXCEPT {
1070 f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
1071 f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
1072 div = F128::Div(a, b);
1074 f128 zero = F128::SetZero();
1077 m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1078 m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1079 m.r[2] = F128::Splat<true, true, false, true>(div, zero);
// Row 3: lane 2 receives lane 3 of div, then lane 3 is overwritten with 1.
1080 f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1081 m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1087 Matrix::OrthographicOffCenterLh(
float left,
float right,
float bottom,
float top,
float near_z,
1091 f128 a = F128::SetOne();
1092 f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1093 div = F128::Div(a, b);
1095 f128 zero = F128::SetZero();
1096 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1097 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
1098 v0 = F128::Mult(v0, div);
1101 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1102 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1103 m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1104 m.r[3] = F128::Mult(r3, div);
1110 Matrix::OrthographicOffCenterRh(
float left,
float right,
float bottom,
float top,
float near_z,
1114 f128 a = F128::SetOne();
1115 f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
1116 div = F128::Div(a, b);
1118 f128 zero = F128::SetZero();
1119 f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1120 f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
1121 v0 = F128::Mult(v0, div);
1124 m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1125 m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1126 m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1127 m.r[3] = F128::Mult(r3, div);
1134 SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
1136 f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
1137 plane = F128::Negate(plane);
1138 f128 r1 = F128::RotateLeft<1>(r0);
1139 f128 r2 = F128::RotateLeft<2>(r0);
1140 f128 r3 = F128::RotateLeft<3>(r0);
1143 m.r[0] = F128::MultAdd<0>(plane, light_pos, r0,
each_select32);
1144 m.r[1] = F128::MultAdd<1>(plane, light_pos, r1,
each_select32);
1145 m.r[2] = F128::MultAdd<2>(plane, light_pos, r2,
each_select32);
1146 m.r[3] = F128::MultAdd<3>(plane, light_pos, r3,
each_select32);
1153 SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1154 f128 minus_2n = F128::Mult(-2.f, plane);
1155 minus_2n = F128::SetZeroToLane<3>(minus_2n);
1157 SimdMatrix m = Matrix::Identity();
1158 m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0],
each_select32);
1159 m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1],
each_select32);
1160 m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2],
each_select32);
1161 m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3],
each_select32);
1176 f128 dot_x = Vector3::DotEx<true, false, false, true>(m.r[0], m.r[0]);
1177 f128 dot_y = Vector3::DotEx<false, true, false, true>(m.r[1], m.r[1]);
1178 f128 dot_z = Vector3::DotEx<false, false, true, true>(m.r[2], m.r[2]);
1179 f128 dot = F128::Or(dot_x, dot_y);
1180 dot = F128::Or(dot, dot_z);
1181 recp_scale = F128::RecpSqrt(dot);
1182 *scale = F128::Mult(dot, recp_scale);
1186 rot->r[0] = F128::Mult<0>(recp_scale, m.r[0],
each_select32);
1187 rot->r[1] = F128::Mult<1>(recp_scale, m.r[1],
each_select32);
1188 rot->r[2] = F128::Mult<2>(recp_scale, m.r[2],
each_select32);
1189 rot->r[3] = F128::Set0001();
1194 #endif // NLIB_DOXYGEN 1199 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
f128arg SimdVectorArg
f128argがtypedefされています。
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
インプレイスで行列を転置するためのマクロです。
constexpr const each_float_tag each_float
each_float_tag型の定数オブジェクトで、単精度浮動小数点数を示すためのタグです。
f128arg SimdQuaternionArg
f128argがtypedefされています。
f128arg SimdPlaneArg
f128argがtypedefされています。
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
単精度浮動小数点数のSIMD演算を行うためのクラスや関数が定義されています。
constexpr const each_select32_tag each_select32
each_select32_tag型の定数オブジェクトで、32bitのレーンを選択することを示すためのタグです。 ...
4x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x3の配列で16バイ...
3x3行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x3の配列です。 ...
nlib_f128_t f128
nlib_f128_tがtypedefされています。
4x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは4x4の配列で16バイ...
3x4行列をメモリから読み出したりメモリに書き出したりするための型です。データメンバmは3x4の配列で16バイ...
f128 SimdPlane
f128がtypedefされています。平面を扱う場合に利用されます。
f128 SimdVector
f128がtypedefされています。3次元ベクトル又は4次元ベクトルを扱う場合に利用されます。 ...