nlib
SimdMatrix.h
Go to the documentation of this file.
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
18 
19 #include "nn/nlib/simd/SimdFloat.h"
23 // #include "nn/nlib/simd/SimdGeometry.h"
24 
25 NLIB_NAMESPACE_BEGIN
26 namespace simd {
27 
// Static-only helper collection for SimdMatrix; the definitions follow below as Matrix::*.
// NOTE(review): the `class Matrix {` line that should open this declaration is not
// visible in this listing -- confirm against the original header.
29  public:
 // Conversions between SimdMatrix and the plain Float4x4 / Float3x4 / Float4x3 / Float3x3 structs.
30  static SimdMatrix __vectorcall LoadFloat4x4(const Float4x4* p) NLIB_NOEXCEPT;
31  static SimdMatrix __vectorcall LoadFloat3x4(const Float3x4* p) NLIB_NOEXCEPT;
32  static SimdMatrix __vectorcall LoadFloat4x3(const Float4x3* p) NLIB_NOEXCEPT;
33  static SimdMatrix __vectorcall LoadFloat3x3(const Float3x3* p) NLIB_NOEXCEPT;
34  static void __vectorcall StoreFloat4x4(Float4x4* p, SimdMatrixArg m) NLIB_NOEXCEPT;
35  static void __vectorcall StoreFloat3x4(Float3x4* p, SimdMatrixArg m) NLIB_NOEXCEPT;
36  static void __vectorcall StoreFloat4x3(Float4x3* p, SimdMatrixArg m) NLIB_NOEXCEPT;
37  static void __vectorcall StoreFloat3x3(Float3x3* p, SimdMatrixArg m) NLIB_NOEXCEPT;
38 
 // Inverse() writes the determinant through `det` only when `det` is non-null.
39  static SimdVector __vectorcall Determinant(SimdMatrixArg m) NLIB_NOEXCEPT;
40  static SimdMatrix __vectorcall Identity() NLIB_NOEXCEPT;
41  static SimdMatrix __vectorcall Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT;
42 
 // Predicates and products. MultTranspose(a, b) is transpose(a * b).
43  static bool __vectorcall IsIdentity(SimdMatrixArg m) NLIB_NOEXCEPT;
44  static bool __vectorcall IsInfinite(SimdMatrixArg m) NLIB_NOEXCEPT;
45  static bool __vectorcall IsNaN(SimdMatrixArg m) NLIB_NOEXCEPT;
46  static SimdMatrix __vectorcall Mult(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT;
47  static SimdMatrix __vectorcall Transpose(SimdMatrixArg m) NLIB_NOEXCEPT;
48  static SimdMatrix __vectorcall MultTranspose(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT;
49 
 // Scaling and translation matrices, from scalars or from a vector.
50  static SimdMatrix __vectorcall FromScaling(float scale_x, float scale_y,
51  float scale_z) NLIB_NOEXCEPT;
52  static SimdMatrix __vectorcall FromScaling(SimdVectorArg scale) NLIB_NOEXCEPT;
53  static SimdMatrix __vectorcall FromTranslation(float ofs_x, float ofs_y,
54  float ofs_z) NLIB_NOEXCEPT;
55  static SimdMatrix __vectorcall FromTranslation(SimdVectorArg ofs) NLIB_NOEXCEPT;
56 
 // Rotation matrices. The caller passes precomputed sine/cosine values rather than angles.
57  static SimdMatrix __vectorcall FromRotationX(float sin_value, float cos_value) NLIB_NOEXCEPT;
58  static SimdMatrix __vectorcall FromRotationY(float sin_value, float cos_value) NLIB_NOEXCEPT;
59  static SimdMatrix __vectorcall FromRotationZ(float sin_value, float cos_value) NLIB_NOEXCEPT;
60  static SimdMatrix __vectorcall FromRotationAxisAndSinCos(SimdVectorArg axis_normalized,
61  float sin_value,
62  float cos_value) NLIB_NOEXCEPT;
63  static SimdMatrix __vectorcall FromRotationQuaternion(SimdQuaternionArg quat) NLIB_NOEXCEPT;
64  static SimdMatrix __vectorcall
65  FromRotationZXY(SimdVectorArg sin_xyz, SimdVectorArg cos_xyz) NLIB_NOEXCEPT;
66 
 // View matrices; the Lh / Rh suffix presumably selects left- / right-handed coordinates.
67  static SimdMatrix __vectorcall LookToLh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
68  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
69  static SimdMatrix __vectorcall LookAtLh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
70  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
71  static SimdMatrix __vectorcall LookToRh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
72  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
73  static SimdMatrix __vectorcall LookAtRh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
74  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
75 
 // Perspective projection matrices.
76  static SimdMatrix __vectorcall
77  PerspectiveLh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT;
78  static SimdMatrix __vectorcall
79  PerspectiveRh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT;
80  static SimdMatrix __vectorcall PerspectiveFovLh(float half_fovy_sin, float half_fovy_cos,
81  float aspect, float near_z,
82  float far_z) NLIB_NOEXCEPT;
83  static SimdMatrix __vectorcall PerspectiveFovRh(float half_fovy_sin, float half_fovy_cos,
84  float aspect, float near_z,
85  float far_z) NLIB_NOEXCEPT;
86  static SimdMatrix __vectorcall PerspectiveOffCenterLh(float left, float right, float bottom,
87  float top, float near_z,
88  float far_z) NLIB_NOEXCEPT;
89  static SimdMatrix __vectorcall PerspectiveOffCenterRh(float left, float right, float bottom,
90  float top, float near_z,
91  float far_z) NLIB_NOEXCEPT;
92 
 // Orthographic projection matrices.
93  static SimdMatrix __vectorcall
94  OrthographicLh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT;
95  static SimdMatrix __vectorcall
96  OrthographicRh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT;
97  static SimdMatrix __vectorcall OrthographicOffCenterLh(float left, float right, float bottom,
98  float top, float near_z,
99  float far_z) NLIB_NOEXCEPT;
100  static SimdMatrix __vectorcall OrthographicOffCenterRh(float left, float right, float bottom,
101  float top, float near_z,
102  float far_z) NLIB_NOEXCEPT;
103 
104  static SimdMatrix __vectorcall
105  Shadow(SimdPlaneArg shadow_plane, SimdVector light_pos) NLIB_NOEXCEPT;
106  static SimdMatrix __vectorcall Reflect(SimdPlaneArg reflection_plane) NLIB_NOEXCEPT;
 // NOTE(review): the Decompose declaration below stops after `trans,` -- its remaining
 // parameter(s) and closing `) NLIB_NOEXCEPT;` appear to be missing from this listing.
107  static void __vectorcall Decompose(SimdVector* scale, SimdMatrix* rot, SimdVector* trans,
109  // AffineTransform
110 
111  private:
 // Declared private so the class cannot be instantiated; only the static members are used.
112  Matrix(); // forbidden
113 };
114 
115 #ifndef NLIB_DOXYGEN
116 
// Shorthand for the `inline <type> __vectorcall` prefix used by the member definitions below.
117 #define NLIB_M(tp) inline tp __vectorcall
118 
119 NLIB_M(SimdMatrix) Matrix::LoadFloat4x4(const Float4x4* p) NLIB_NOEXCEPT {
120  SimdMatrix m;
121  m.r[0] = F128::LoadA16(&p->m[0][0]);
122  m.r[1] = F128::LoadA16(&p->m[1][0]);
123  m.r[2] = F128::LoadA16(&p->m[2][0]);
124  m.r[3] = F128::LoadA16(&p->m[3][0]);
125  return m;
126 }
127 
128 // Load and convert to column major order matrix
129 NLIB_M(SimdMatrix) Matrix::LoadFloat3x4(const Float3x4* p) NLIB_NOEXCEPT {
130  SimdMatrix m;
131  m.r[0] = F128::LoadA16(&p->m[0][0]);
132  m.r[1] = F128::LoadA16(&p->m[1][0]);
133  m.r[2] = F128::LoadA16(&p->m[2][0]);
134  m.r[3] = F128::Set0001();
135  NLIB_F128_TRANSPOSE(m.r[0], m.r[1], m.r[2], m.r[3]);
136  return m;
137 }
138 
139 NLIB_M(SimdMatrix) Matrix::LoadFloat4x3(const Float4x3* p) NLIB_NOEXCEPT {
140  f128 t0 = F128::LoadA16(&p->m[0][0]);
141  f128 t1 = F128::LoadA16(&p->m[1][1]);
142  f128 t2 = F128::LoadA16(&p->m[2][2]);
143  SimdMatrix m;
144  m.r[0] = F128::SetZeroToLane<3>(t0);
145  f128 tmp1 = F128::Permute<3, 4, 5, -1>(t0, t1);
146  m.r[1] = F128::SetZeroToLane<3>(tmp1);
147  f128 tmp2 = F128::Permute<2, 3, 4, -1>(t1, t2);
148  m.r[2] = F128::SetZeroToLane<3>(tmp2);
149  m.r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
150  return m;
151 }
152 
153 NLIB_M(SimdMatrix) Matrix::LoadFloat3x3(const Float3x3* p) NLIB_NOEXCEPT {
154  f128 t0 = F128::LoadA4(&p->m[0][0]);
155  f128 t1 = F128::LoadA4(&p->m[1][0]);
156  f128 t2 = F128::LoadA4(&p->m[1][2]);
157  f128 zero = F128::SetZero();
158  SimdMatrix m;
159  m.r[0] = F128::SetZeroToLane<3>(t0);
160  m.r[1] = F128::SetZeroToLane<3>(t1);
161  m.r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
162  m.r[3] = F128::Set0001();
163  return m;
164 }
165 
166 inline void __vectorcall Matrix::StoreFloat4x4(Float4x4* p, SimdMatrixArg m) NLIB_NOEXCEPT {
167  F128::StoreA16(&p->m[0][0], m.r[0]);
168  F128::StoreA16(&p->m[1][0], m.r[1]);
169  F128::StoreA16(&p->m[2][0], m.r[2]);
170  F128::StoreA16(&p->m[3][0], m.r[3]);
171 }
172 
173 inline void __vectorcall Matrix::StoreFloat4x3(Float4x3* p, SimdMatrixArg m) NLIB_NOEXCEPT {
174  f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
175  f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
176  f128 t2 = F128::Permute<2, 4, 5, 6>(m.r[2], m.r[3]);
177  F128::StoreA16(&p->m[0][0], t0);
178  F128::StoreA16(&p->m[1][1], t1);
179  F128::StoreA16(&p->m[2][2], t2);
180 }
181 
182 inline void __vectorcall Matrix::StoreFloat3x3(Float3x3* p, SimdMatrixArg m) NLIB_NOEXCEPT {
183  f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
184  f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
185  F128::StoreA4(&p->m[0][0], t0);
186  F128::StoreA4(&p->m[1][1], t1);
187  p->m[2][2] = F128::GetFloatFromLane<2>(m.r[2]);
188 }
189 
190 // XMMatrixDeterminant
191 // r = determinant(m)
192 NLIB_M(f128) Matrix::Determinant(SimdMatrixArg m) NLIB_NOEXCEPT {
193  // a0 b0 c0 d0
194  // a1 b1 c1 d1
195  // a2 b2 c2 d2
196  // a3 b3 c3 d3
197  //
198  // a0|bcd123| - b0|acd123| + c0|abd123| + d0|abc123| -> SubAdd & Dot
199 
200  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
201  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
202  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
203  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
204 
205  f128 c0det, c1det, c2det;
206  {
207  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.r[2]);
208  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.r[3]);
209 
210  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.r[2]);
211  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.r[3]);
212 
213  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.r[2]);
214  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.r[3]);
215 
216  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
217  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
218  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
219 
220  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
221  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
222  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
223  }
224 
225  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.r[1]);
226  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.r[1]);
227  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.r[1]);
228  f128 r0x = F128::NegateEx<true, false, true, false>(m.r[0]);
229 
230  f128 det3_neg = F128::Mult(c1det, ccbb_1);
231  det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
232  det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
233  return Vector4::Dot(r0x, det3_neg);
234 }
235 
236 // XMMatrixIdentity
237 // r[0] = { 1, 0, 0, 0 }
238 // r[1] = { 0, 1, 0, 0 }
239 // r[2] = { 0, 0, 1, 0 }
240 // r[3] = { 0, 0, 0, 1 }
241 NLIB_M(SimdMatrix) Matrix::Identity() NLIB_NOEXCEPT {
242  SimdMatrix m;
243 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
244  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
245  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
246  float32x2_t x00 = vcreate_f32(0ULL);
247  m.r[0] = vcombine_f32(x10, x00);
248  m.r[1] = vcombine_f32(x01, x00);
249  m.r[2] = vcombine_f32(x00, x10);
250  m.r[3] = vcombine_f32(x00, x01);
251 #else
252  m.r[0] = F128::LoadA16(F128::v1000_);
253  m.r[1] = F128::LoadA16(F128::v0100_);
254  m.r[2] = F128::LoadA16(F128::v0010_);
255  m.r[3] = F128::LoadA16(F128::v0001_);
256 #endif
257  return m;
258 }
259 
260 // XMMatrixTranspose
261 // use NLIB_F128_TRANSPOSE(m.r[0], m.r[1], m.r[2], m.r[3]) if you can change 'm' itself
262 NLIB_M(SimdMatrix) Matrix::Transpose(SimdMatrixArg m) NLIB_NOEXCEPT {
263  SimdMatrix ret;
264  f128 r0 = m.r[0];
265  f128 r1 = m.r[1];
266  f128 r2 = m.r[2];
267  f128 r3 = m.r[3];
268  NLIB_F128_TRANSPOSE(r0, r1, r2, r3);
269  ret.r[0] = r0;
270  ret.r[1] = r1;
271  ret.r[2] = r2;
272  ret.r[3] = r3;
273  return ret;
274 }
275 
276 inline void __vectorcall Matrix::StoreFloat3x4(Float3x4* p, SimdMatrixArg m) NLIB_NOEXCEPT {
277  SimdMatrix M = Matrix::Transpose(m);
278  F128::StoreA16(&p->m[0][0], M.r[0]);
279  F128::StoreA16(&p->m[1][0], M.r[1]);
280  F128::StoreA16(&p->m[2][0], M.r[2]);
281 }
282 
283 #if 1
284 // XMMatrixInverse
285 NLIB_M(SimdMatrix) Matrix::Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT {
286  // faster on NEON
287  // SimdMatrix M = Transpose(m);
288  // M:
289  // a0 b0 c0 d0
290  // a1 b1 c1 d1
291  // a2 b2 c2 d2
292  // a3 b3 c3 d3
293 
294  // Inv:
295  // ( |bcd123|, -|acd123|, |abd123|, -|abc123|) / detm
296  // (-|bcd023|, |acd023|, -|abd023|, |abc023}) / detm
297  // ( |bcd012|, -|acd012|, |abd012|, -|abc012|) / detm
298  // (-|bcd013|, |acd013|, -|abd013|, |abc013|) / detm
299 
300  // row0:
301  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
302  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
303  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
304  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
305 
306  // row1:
307  // |bcd023| = b0|cd23| - c0|bd23| + d0|bc23| -> lane0
308  // |acd023| = a0|cd23| - c0|ad23| + d0|ac23| -> lane1
309  // |abd023| = a0|bd23| - b0|ad23| + d0|ab23| -> lane2
310  // |abc023| = a0|bc23| - b0|ac23| + c0|ab23| -> lane3
311  f128 detvalue_reciprocal;
312  SimdMatrix ret;
313  f128 mydet;
314  {
315  f128 c0det, c1det, c2det;
316  {
317  f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
318  f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
319 
320  f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
321  f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
322 
323  f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
324  f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
325 
326  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
327  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
328  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
329 
330  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
331  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
332  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
333  }
334  {
335  f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
336  f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
337  f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
338 
339  f128 r0x_m = F128::Permute<0, -1, 4, -1>(m.r[0], m.r[2]);
340  f128 r0x_p = F128::Permute<-1, 0, -1, 4>(m.r[1], m.r[3]);
341  f128 r0x = F128::Permute<0, 5, 2, 7>(F128::Negate(r0x_m), r0x_p);
342 
343  f128 det3 = F128::Mult(c1det, ccbb_1);
344  det3 = F128::MultSub(c0det, baaa_1, det3);
345  det3 = F128::MultSub(c2det, dddc_1, det3);
346 
347  mydet = Vector4::Dot(r0x, det3);
348 
349  det3 = F128::NegateEx<true, false, true, false>(det3);
350  detvalue_reciprocal = F128::Recp(mydet);
351 
352  ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
353  }
354  {
355  f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
356  f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
357  f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
358 
359  f128 det3 = F128::Mult(c0det, baaa_0);
360  det3 = F128::MultAdd(c2det, dddc_0, det3);
361  det3 = F128::MultSub(c1det, ccbb_0, det3);
362  det3 = F128::NegateEx<true, false, true, false>(det3);
363  ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
364  }
365  }
366 
367  // row3:
368  // |bcd012| = b2|cd01| - c2|bd01| + d2|bc01| -> lane0
369  // |acd012| = a2|cd01| - c2|ad01| + d2|ac01| -> lane1
370  // |abd012| = a2|bd01| - b2|ad01| + d2|ab01| -> lane2
371  // |abc012| = a2|bc01| - b2|ac01| + c2|ab01| -> lane3
372 
373  // row2:
374  // |bcd013| = b3|cd01| - c3|bd01| + d3|bc01| -> lane0
375  // |acd013| = a3|cd01| - c3|ad01| + d3|ac01| -> lane1
376  // |abd013| = a3|bd01| - b3|ad01| + d3|ab01| -> lane2
377  // |abc013| = a3|bc01| - b3|ac01| + c3|ab01| -> lane3
378  {
379  f128 c0det, c1det, c2det;
380  {
381  f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
382  f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
383 
384  f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
385  f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
386 
387  f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
388  f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
389 
390  f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
391  f128 tmp1 = F128::Mult(baaa_0, dddc_1);
392  f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
393 
394  c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
395  c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
396  c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
397  }
398  {
399  f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
400  f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
401  f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
402 
403  f128 det3 = F128::Mult(c1det, ccbb_3);
404  det3 = F128::MultSub(c0det, baaa_3, det3);
405  det3 = F128::MultSub(c2det, dddc_3, det3);
406  det3 = F128::NegateEx<true, false, true, false>(det3);
407 
408  ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
409  }
410  {
411  f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
412  f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
413  f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
414 
415  f128 det3 = F128::Mult(c0det, baaa_2);
416  det3 = F128::MultAdd(c2det, dddc_2, det3);
417  det3 = F128::MultSub(c1det, ccbb_2, det3);
418  det3 = F128::NegateEx<true, false, true, false>(det3);
419 
420  ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
421  }
422  }
423  if (det) {
424  *det = mydet;
425  }
426  return ret;
427 }
428 #else
429 // XMMatrixInverse
430 NLIB_M(SimdMatrix) Matrix::Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT {
431  SimdMatrix M = Transpose(m);
432  // M:
433  // a0 b0 c0 d0
434  // a1 b1 c1 d1
435  // a2 b2 c2 d2
436  // a3 b3 c3 d3
437 
438  // Inv:
439  // ( |bcd123|, -|acd123|, |abd123|, -|abc123|) / detm
440  // (-|bcd023|, |acd023|, -|abd023|, |abc023}) / detm
441  // ( |bcd012|, -|acd012|, |abd012|, -|abc012|) / detm
442  // (-|bcd013|, |acd013|, -|abd013|, |abc013|) / detm
443 
444  // row0:
445  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
446  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
447  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
448  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
449 
450  // row1:
451  // |bcd023| = b0|cd23| - c0|bd23| + d0|bc23| -> lane0
452  // |acd023| = a0|cd23| - c0|ad23| + d0|ac23| -> lane1
453  // |abd023| = a0|bd23| - b0|ad23| + d0|ab23| -> lane2
454  // |abc023| = a0|bc23| - b0|ac23| + c0|ab23| -> lane3
455  f128 detvalue_reciprocal;
456  SimdMatrix ret;
457  {
458  f128 c0det, c1det, c2det;
459  {
460  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
461  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
462 
463  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
464  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
465 
466  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
467  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
468 
469  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
470  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
471  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
472 
473  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
474  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
475  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
476  }
477  {
478  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
479  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
480  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
481  f128 r0x = F128::NegateEx<true, false, true, false>(M.r[0]);
482 
483  f128 det3 = F128::Mult(c1det, ccbb_1);
484  det3 = F128::MultSub(c0det, baaa_1, det3);
485  det3 = F128::MultSub(c2det, dddc_1, det3);
486 
487  detvalue_reciprocal = Vector4::Dot(r0x, det3);
488  if (det) {
489  *det = detvalue_reciprocal;
490  }
491 
492  det3 = F128::NegateEx<true, false, true, false>(det3);
493  detvalue_reciprocal = F128::Recp(detvalue_reciprocal);
494 
495  ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
496  }
497  {
498  f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
499  f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
500  f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
501 
502  f128 det3 = F128::Mult(c0det, baaa_0);
503  det3 = F128::MultAdd(c2det, dddc_0, det3);
504  det3 = F128::MultSub(c1det, ccbb_0, det3);
505  det3 = F128::NegateEx<true, false, true, false>(det3);
506  ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
507  }
508  }
509 
510  // row3:
511  // |bcd012| = b2|cd01| - c2|bd01| + d2|bc01| -> lane0
512  // |acd012| = a2|cd01| - c2|ad01| + d2|ac01| -> lane1
513  // |abd012| = a2|bd01| - b2|ad01| + d2|ab01| -> lane2
514  // |abc012| = a2|bc01| - b2|ac01| + c2|ab01| -> lane3
515 
516  // row2:
517  // |bcd013| = b3|cd01| - c3|bd01| + d3|bc01| -> lane0
518  // |acd013| = a3|cd01| - c3|ad01| + d3|ac01| -> lane1
519  // |abd013| = a3|bd01| - b3|ad01| + d3|ab01| -> lane2
520  // |abc013| = a3|bc01| - b3|ac01| + c3|ab01| -> lane3
521  {
522  f128 c0det, c1det, c2det;
523  {
524  f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
525  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
526 
527  f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
528  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
529 
530  f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
531  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
532 
533  f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
534  f128 tmp1 = F128::Mult(baaa_0, dddc_1);
535  f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
536 
537  c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
538  c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
539  c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
540  }
541  {
542  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
543  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
544  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
545 
546  f128 det3 = F128::Mult(c1det, ccbb_3);
547  det3 = F128::MultSub(c0det, baaa_3, det3);
548  det3 = F128::MultSub(c2det, dddc_3, det3);
549  det3 = F128::NegateEx<true, false, true, false>(det3);
550 
551  ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
552  }
553  {
554  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
555  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
556  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
557 
558  f128 det3 = F128::Mult(c0det, baaa_2);
559  det3 = F128::MultAdd(c2det, dddc_2, det3);
560  det3 = F128::MultSub(c1det, ccbb_2, det3);
561  det3 = F128::NegateEx<true, false, true, false>(det3);
562 
563  ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
564  }
565  }
566  return ret;
567 }
568 #endif
569 
570 // XMMatrixIsIdentity
571 // true if m is an indentity matrix
572 NLIB_M(bool) Matrix::IsIdentity(SimdMatrixArg m) NLIB_NOEXCEPT { // NOLINT
573  SimdMatrix x = Matrix::Identity();
574  f128 cmp0 = F128::CmpEq(x.r[0], m.r[0]);
575  f128 cmp1 = F128::CmpEq(x.r[1], m.r[1]);
576  f128 cmp2 = F128::CmpEq(x.r[2], m.r[2]);
577  f128 cmp3 = F128::CmpEq(x.r[3], m.r[3]);
578  cmp0 = F128::And(cmp0, cmp1);
579  cmp2 = F128::And(cmp2, cmp3);
580  cmp0 = F128::And(cmp0, cmp2);
581  return F128::IsAllMaskTrue(cmp0);
582 }
583 
584 // XMMatrixIsInfinite
585 // true if there is (i, j) which satisfies isinf(m[i][j])
586 NLIB_M(bool) Matrix::IsInfinite(SimdMatrixArg m) NLIB_NOEXCEPT { // NOLINT
587 #ifdef NLIB_F128_SIMD_NOUSE
588  f128 cmp0 = F128::IsInfinite(m.r[0]);
589  f128 cmp1 = F128::IsInfinite(m.r[1]);
590  f128 cmp2 = F128::IsInfinite(m.r[2]);
591  f128 cmp3 = F128::IsInfinite(m.r[3]);
592  cmp0 = F128::Or(cmp0, cmp1);
593  cmp2 = F128::Or(cmp2, cmp3);
594  cmp0 = F128::Or(cmp0, cmp2);
595  return !F128::IsAllMaskFalse(cmp0);
596 #else
597  f128 inf_value = F128::SetInfinity();
598  f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.r[0]));
599  f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.r[1]));
600  f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.r[2]));
601  f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.r[3]));
602  cmp0 = F128::Or(cmp0, cmp1);
603  cmp2 = F128::Or(cmp2, cmp3);
604  cmp0 = F128::Or(cmp0, cmp2);
605  return !F128::IsAllMaskFalse(cmp0);
606 #endif
607 }
608 
609 // XMMatrixIsNaN
610 // true if there is (i, j) which satisfies isnan(m[i][j])
611 NLIB_M(bool) Matrix::IsNaN(SimdMatrixArg m) NLIB_NOEXCEPT { // NOLINT
612  f128 cmp0 = F128::IsNaN(m.r[0]);
613  f128 cmp1 = F128::IsNaN(m.r[1]);
614  f128 cmp2 = F128::IsNaN(m.r[2]);
615  f128 cmp3 = F128::IsNaN(m.r[3]);
616  cmp0 = F128::Or(cmp0, cmp1);
617  cmp2 = F128::Or(cmp2, cmp3);
618  cmp0 = F128::Or(cmp0, cmp2);
619  return !F128::IsAllMaskFalse(cmp0);
620 }
621 
622 // XMMatrixMultiply
623 // r = a * b
624 NLIB_M(SimdMatrix) Matrix::Mult(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
625  SimdMatrix m;
626  m.r[0] = Vector4::Transform(a.r[0], b);
627  m.r[1] = Vector4::Transform(a.r[1], b);
628  m.r[2] = Vector4::Transform(a.r[2], b);
629  m.r[3] = Vector4::Transform(a.r[3], b);
630  return m;
631 }
632 
633 // XMMatrixMultiplyTranspose
634 // r = transpose(a * b)
635 NLIB_M(SimdMatrix) Matrix::MultTranspose(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
636  f128 r0 = Vector4::Transform(a.r[0], b);
637  f128 r1 = Vector4::Transform(a.r[1], b);
638  f128 r2 = Vector4::Transform(a.r[2], b);
639  f128 r3 = Vector4::Transform(a.r[3], b);
640  SimdMatrix ret;
641  NLIB_F128_TRANSPOSE(r0, r1, r2, r3);
642  ret.r[0] = r0;
643  ret.r[1] = r1;
644  ret.r[2] = r2;
645  ret.r[3] = r3;
646  return ret;
647 }
648 
649 // XMMatrixScaling
650 // r[0] = { x, 0, 0, 0 }
651 // r[1] = { 0, y, 0, 0 }
652 // r[2] = { 0, 0, z, 0 }
653 // r[3] = { 0, 0, 0, 1 }
654 NLIB_M(SimdMatrix) Matrix::FromScaling(float scale_x, float scale_y, float scale_z) NLIB_NOEXCEPT {
655  SimdMatrix m;
656  f128 zero = F128::SetZero();
657  m.r[0] = F128::SetFloatToLane<0>(zero, scale_x);
658  m.r[1] = F128::SetFloatToLane<1>(zero, scale_y);
659  m.r[2] = F128::SetFloatToLane<2>(zero, scale_z);
660  m.r[3] = F128::Set0001();
661  return m;
662 }
663 
664 // XMMatrixScalingFromVector
665 // r[0] = { x, 0, 0, 0 }
666 // r[1] = { 0, y, 0, 0 }
667 // r[2] = { 0, 0, z, 0 }
668 // r[3] = { 0, 0, 0, 1 }
669 NLIB_M(SimdMatrix) Matrix::FromScaling(SimdVectorArg scale) NLIB_NOEXCEPT {
670  SimdMatrix m;
671  f128 zero = F128::SetZero();
672  m.r[0] = F128::Splat<false, true, true, true>(scale, zero);
673  m.r[1] = F128::Splat<true, false, true, true>(scale, zero);
674  m.r[2] = F128::Splat<true, true, false, true>(scale, zero);
675  m.r[3] = F128::Set0001();
676  return m;
677 }
678 
679 // XMMatrixTranslation
680 // r[0] = { 1, 0, 0, 0 }
681 // r[1] = { 0, 1, 0, 0 }
682 // r[2] = { 0, 0, 1, 0 }
683 // r[3] = { x, y, z, 1 }
684 NLIB_M(SimdMatrix) Matrix::FromTranslation(float ofs_x, float ofs_y, float ofs_z) NLIB_NOEXCEPT {
685  SimdMatrix m;
686  m.r[0] = F128::Set1000();
687  m.r[1] = F128::Set0100();
688  m.r[2] = F128::Set0010();
689  m.r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
690  return m;
691 }
692 
693 // XMMatrixTranslationFromVector
694 // r[0] = { 1, 0, 0, 0 }
695 // r[1] = { 0, 1, 0, 0 }
696 // r[2] = { 0, 0, 1, 0 }
697 // r[3] = { x, y, z, 1 }
698 NLIB_M(SimdMatrix) Matrix::FromTranslation(SimdVectorArg ofs) NLIB_NOEXCEPT {
699  SimdMatrix m;
700  m.r[0] = F128::Set1000();
701  m.r[1] = F128::Set0100();
702  m.r[2] = F128::Set0010();
703  m.r[3] = F128::Permute<0, 1, 2, 4>(ofs, m.r[0]);
704  return m;
705 }
706 
707 // XMMatrixRotationX
708 // r[0] = { 1 0 0 0 }
709 // r[1] = { 0 c s 0 }
710 // r[2] = { 0 -s c 0 }
711 // r[3] = { 0 0 0 1 }
712 NLIB_M(SimdMatrix) Matrix::FromRotationX(float sin_value, float cos_value) NLIB_NOEXCEPT {
713  SimdMatrix m;
714  SimdVector zero = F128::SetZero();
715  f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
716  r1 = F128::SetFloatToLane<2>(r1, sin_value);
717  f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
718  r2 = F128::SetFloatToLane<2>(r2, cos_value);
719 
720  m.r[0] = F128::Set1000();
721  m.r[1] = r1;
722  m.r[2] = r2;
723  m.r[3] = F128::Set0001();
724  return m;
725 }
726 
727 // XMMatrixRotationY
728 // r[0] = { c 0 -s 0 }
729 // r[1] = { 0 1 0 0 }
730 // r[2] = { s 0 c 0 }
731 // r[3] = { 0 0 0 1 }
732 NLIB_M(SimdMatrix) Matrix::FromRotationY(float sin_value, float cos_value) NLIB_NOEXCEPT {
733  SimdMatrix m;
734  SimdVector zero = F128::SetZero();
735  f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
736  r0 = F128::SetFloatToLane<2>(r0, -sin_value);
737  f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
738  r2 = F128::SetFloatToLane<2>(r2, cos_value);
739 
740  m.r[0] = r0;
741  m.r[1] = F128::Set0100();
742  m.r[2] = r2;
743  m.r[3] = F128::Set0001();
744  return m;
745 }
746 
747 // XMMatrixRotationZ
748 // r[0] = { c s 0 0 }
749 // r[1] = { -s c 0 0 }
750 // r[2] = { 0 0 1 0 }
751 // r[3] = { 0 0 0 1 }
752 NLIB_M(SimdMatrix) Matrix::FromRotationZ(float sin_value, float cos_value) NLIB_NOEXCEPT {
753  SimdMatrix m;
754  SimdVector zero = F128::SetZero();
755  f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
756  r0 = F128::SetFloatToLane<1>(r0, sin_value);
757  f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
758  r1 = F128::SetFloatToLane<1>(r1, cos_value);
759 
760  m.r[0] = r0;
761  m.r[1] = r1;
762  m.r[2] = F128::Set0010();
763  m.r[3] = F128::Set0001();
764  return m;
765 }
766 
767 // XMMatrixRotationAxis
768 // The result may be different from DirectXMath's XMMatrixRotationAxis.
769 // It is because the calculation order is different.
770 NLIB_M(SimdMatrix) Matrix::FromRotationAxisAndSinCos(SimdVectorArg axis_normalized, float sin_value,
771  float cos_value) NLIB_NOEXCEPT {
772  // m00, m11, m22, *
773  f128 diagonal, c1;
774  {
775  f128 nn = F128::Mult(axis_normalized, axis_normalized);
776  f128 c = F128::SetValue(cos_value, each_float);
777  c1 = F128::SetValue(1.f - cos_value, each_float);
778  diagonal = F128::MultAdd(c1, nn, c);
779  diagonal = F128::SetZeroToLane<3>(diagonal);
780  }
781 
782  f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
783  f128 s = F128::SetValue(sin_value, each_float);
784  f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
785  xy_yz_xz = F128::Mult(c1, xy_yz_xz);
786  f128 plus = F128::MultAdd(s, zxy, xy_yz_xz); // xy(1-c)+sz, yz(1-c)+sx, xz(1-c)+sy
787  f128 minus = F128::MultSub(s, zxy, xy_yz_xz); // xy(1-c)-sz, yz(1-c)-sx, xz(1-c)-sy
788 
789  f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
790  f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
791 
792  SimdMatrix m;
793  m.r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
794  m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
795  m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
796  m.r[3] = F128::Set0001();
797  return m;
798 }
799 
800 // XMMatrixRotationQuaternion
801 // the result may differ from directxmath's one because the computation sequence is different.
802 NLIB_M(SimdMatrix) Matrix::FromRotationQuaternion(SimdQuaternionArg quat) NLIB_NOEXCEPT {
803  // m00, m11, m22, *
804  f128 q2 = F128::Add(quat, quat);
805  f128 qq2 = F128::Mult(quat, q2);
806  f128 t0, t1;
807 
808  t0 = F128::Swizzle<1, 0, 0, -1>(qq2); // 2y^2, 2x^2, 2x^2, *
809  t1 = F128::Swizzle<2, 2, 1, -1>(qq2); // 2z^2, 2z^2, 2y^2, *
810  // 1-2y^2-2z^2, 1-2x^2-2z^2, 1-2x^2-2y^2, 0
811  f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
812  diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
813 
814  t0 = F128::Swizzle<1, 0, 0, -1>(quat); // y, x, x, *
815  t1 = F128::Swizzle<2, 2, 1, -1>(q2); // 2z, 2z, 2y, *
816  f128 yz_xz_xy = F128::Mult(t0, t1); // 2yz, 2xz, 2xy, *
817 
818  t0 = F128::SetValue<3>(quat, each_select32); // w, w, w, *
819  f128 wx_wy_wz = F128::Mult(q2, t0); // 2wx, 2wy, 2wz
820 
821  f128 plus = F128::Add(yz_xz_xy, wx_wy_wz); // 2yz+2wx, 2xz+2wy, 2xy+2wz, *
822  f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz); // 2yz-2wx, 2xz-2wy, 2xy-2wz, *
823 
824  t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
825  t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
826 
827  SimdMatrix m;
828  m.r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
829  m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
830  m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
831  m.r[3] = F128::Set0001();
832  return m;
833 }
834 
835 // XMMatrixRotationRollPitchYaw
836 NLIB_M(SimdMatrix) Matrix::FromRotationZXY(SimdVectorArg sin_xyz,
837  SimdVectorArg cos_xyz) NLIB_NOEXCEPT {
838  // CzCy+SzSxSy SzCx -CzSy+SzSxCy 0
839  // -SzCy+CzSxSy CzCx SzSy+CzSxCy 0
840  // CxSy -Sx CxCy 0
841  f128 m00_12_02_10;
842  {
843  f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
844  f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
845  f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
846  m00_12_02_10 = F128::Mult<0>(sin_xyz, tmp, each_select32);
847  tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
848  tmp = F128::NegateEx<false, false, true, true>(tmp);
849  m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
850  }
851  f128 m20_01_22_11;
852  {
853  f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
854  m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz, each_select32);
855  }
856 
857  f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
858  f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
859 
860  SimdMatrix m;
861  m.r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
862  m.r[1] = F128::SetZeroToLane<3>(r1);
863  m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
864  m.r[3] = F128::Set0001();
865  return m;
866 }
867 
868 // XMMatrixLookToLH
869 NLIB_M(SimdMatrix) Matrix::LookToLh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
870  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
871  SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
872  SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
873  SimdVector neg = F128::Negate(eye_pos);
874 #ifdef NLIB_NEON
875  neg = F128::SetZeroToLane<3>(neg);
876  f128 d012 = Vector4::Dot3(neg, r0, r1, eye_dir_normalized);
877  SimdMatrix m;
878  m.r[0] = r0;
879  m.r[1] = r1;
880  m.r[2] = eye_dir_normalized;
881  m.r[3] = F128::Set0001();
882  m = Transpose(m);
883  m.r[3] = F128::SetFloatToLane<3>(d012, 1.f);
884  return m;
885 #else
886  f128 d0 = Vector3::Dot(r0, neg);
887  f128 d1 = Vector3::Dot(r1, neg);
888  f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
889  SimdMatrix m;
890  m.r[0] = F128::Splat<false, false, false, true>(r0, d0);
891  m.r[1] = F128::Splat<false, false, false, true>(r1, d1);
892  m.r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
893  m.r[3] = F128::Set0001();
894  return Transpose(m);
895 #endif
896 }
897 
898 // XMMatrixLookAtLH
899 NLIB_M(SimdMatrix) Matrix::LookAtLh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
900  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
901  SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
902  eye_dir = Vector3::Normalize(eye_dir);
903  return LookToLh(eye_pos, eye_dir, up_dir_normalized);
904 }
905 
906 // XMMatrixLookToRH
907 NLIB_M(SimdMatrix) Matrix::LookToRh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
908  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
909  return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
910 }
911 
912 // XMMatrixLookAtRH
913 NLIB_M(SimdMatrix) Matrix::LookAtRh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
914  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
915  SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
916  eye_dir = Vector3::Normalize(eye_dir);
917  return LookToLh(eye_pos, eye_dir, up_dir_normalized);
918 }
919 
920 // XMMatrixPerspectiveLH
921 NLIB_M(SimdMatrix) Matrix::PerspectiveLh(float width, float height, float near_z,
922  float far_z) NLIB_NOEXCEPT {
923  float near2 = near_z + near_z;
924  float range = far_z / (far_z - near_z);
925  f128 zero = F128::SetZero();
926  f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
927  SimdMatrix m;
928  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
929  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
930  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
931  m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
932  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
933  return m;
934 }
935 
936 // XMMatrixPerspectiveRH
937 NLIB_M(SimdMatrix) Matrix::PerspectiveRh(float width, float height, float near_z,
938  float far_z) NLIB_NOEXCEPT {
939  float near2 = near_z + near_z;
940  float range = far_z / (near_z - far_z);
941  f128 zero = F128::SetZero();
942  f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
943  SimdMatrix m;
944  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
945  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
946  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
947  m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
948  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
949  return m;
950 }
951 
952 // XMMatrixPerspectiveFovLH
953 NLIB_M(SimdMatrix) Matrix::PerspectiveFovLh(float half_fovy_sin, float half_fovy_cos, float aspect,
954  float near_z, float far_z) NLIB_NOEXCEPT {
955  float height = half_fovy_cos / half_fovy_sin;
956  float width = height / aspect;
957  float range = far_z / (far_z - near_z);
958 
959  f128 zero = F128::SetZero();
960  f128 v = F128::SetValue(width, height, range, -range * near_z);
961  SimdMatrix m;
962  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
963  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
964  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
965  m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
966  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
967  return m;
968 }
969 
970 // XMMatrixPerspectiveFovRH
971 NLIB_M(SimdMatrix) Matrix::PerspectiveFovRh(float half_fovy_sin, float half_fovy_cos, float aspect,
972  float near_z, float far_z) NLIB_NOEXCEPT {
973  float height = half_fovy_cos / half_fovy_sin;
974  float width = height / aspect;
975  float range = far_z / (near_z - far_z);
976 
977  f128 zero = F128::SetZero();
978  f128 v = F128::SetValue(width, height, range, range * near_z);
979  SimdMatrix m;
980  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
981  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
982  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
983  m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
984  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
985  return m;
986 }
987 
988 // XMMatrixPerspectiveOffCenterLH
989 NLIB_M(SimdMatrix) Matrix::PerspectiveOffCenterLh(float left, float right, float bottom, float top,
990  float near_z, float far_z) NLIB_NOEXCEPT {
991  float near2 = near_z + near_z;
992  f128 div;
993  {
994  f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
995  f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
996  div = F128::Div(a, b);
997  // recpWidth, recpHeight, range, 1.f
998  }
999  f128 zero = F128::SetZero();
1000  f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
1001  f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
1002  v0 = F128::Mult(v0, div);
1003 
1004  SimdMatrix m;
1005  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1006  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1007  m.r[2] = F128::Mult(r2, div);
1008  m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
1009  return m;
1010 }
1011 
1012 // XMMatrixPerspectiveOffCenterRH
1013 NLIB_M(SimdMatrix) Matrix::PerspectiveOffCenterRh(float left, float right, float bottom, float top,
1014  float near_z, float far_z) NLIB_NOEXCEPT {
1015  float near2 = near_z + near_z;
1016  f128 div;
1017  {
1018  f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1019  f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
1020  div = F128::Div(a, b);
1021  // recpWidth, recpHeight, range, 1.f
1022  }
1023  f128 zero = F128::SetZero();
1024  f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
1025  f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
1026  v0 = F128::Mult(v0, div);
1027 
1028  SimdMatrix m;
1029  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1030  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1031  m.r[2] = F128::Mult(r2, div);
1032  m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
1033  return m;
1034 }
1035 
1036 // XMMatrixOrthographicLH
1037 NLIB_M(SimdMatrix) Matrix::OrthographicLh(float width, float height, float near_z,
1038  float far_z) NLIB_NOEXCEPT {
1039  f128 div;
1040  {
1041  f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
1042  f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
1043  div = F128::Div(a, b);
1044  }
1045  f128 zero = F128::SetZero();
1046 
1047  SimdMatrix m;
1048  m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1049  m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1050  m.r[2] = F128::Splat<true, true, false, true>(div, zero);
1051  f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1052  m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1053  return m;
1054 }
1055 
1056 // XMMatrixOrthographicRH
1057 NLIB_M(SimdMatrix) Matrix::OrthographicRh(float width, float height, float near_z,
1058  float far_z) NLIB_NOEXCEPT {
1059  f128 div;
1060  {
1061  f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
1062  f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
1063  div = F128::Div(a, b);
1064  }
1065  f128 zero = F128::SetZero();
1066 
1067  SimdMatrix m;
1068  m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1069  m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1070  m.r[2] = F128::Splat<true, true, false, true>(div, zero);
1071  f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1072  m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1073  return m;
1074 }
1075 
1076 // XMMatrixOrthographicOffCenterLH
1077 NLIB_M(SimdMatrix) Matrix::OrthographicOffCenterLh(float left, float right, float bottom, float top,
1078  float near_z, float far_z) NLIB_NOEXCEPT {
1079  f128 div;
1080  {
1081  f128 a = F128::SetOne();
1082  f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1083  div = F128::Div(a, b);
1084  }
1085  f128 zero = F128::SetZero();
1086  f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1087  f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
1088  v0 = F128::Mult(v0, div);
1089 
1090  SimdMatrix m;
1091  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1092  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1093  m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1094  m.r[3] = F128::Mult(r3, div);
1095  return m;
1096 }
1097 
1098 // XMMatrixOrthographicOffCenterRH
1099 NLIB_M(SimdMatrix) Matrix::OrthographicOffCenterRh(float left, float right, float bottom, float top,
1100  float near_z, float far_z) NLIB_NOEXCEPT {
1101  f128 div;
1102  {
1103  f128 a = F128::SetOne();
1104  f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
1105  div = F128::Div(a, b);
1106  }
1107  f128 zero = F128::SetZero();
1108  f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1109  f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
1110  v0 = F128::Mult(v0, div);
1111 
1112  SimdMatrix m;
1113  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1114  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1115  m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1116  m.r[3] = F128::Mult(r3, div);
1117  return m;
1118 }
1119 
1120 // XMMatrixShadow
1121 NLIB_M(SimdMatrix) Matrix::Shadow(SimdPlaneArg shadow_plane, SimdVector light_pos) NLIB_NOEXCEPT {
1122  // Plane::Normalize(shadow_plane);
1123  SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
1124  // distance from plane to the light
1125  f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
1126  plane = F128::Negate(plane);
1127  f128 r1 = F128::RotateLeft<1>(r0);
1128  f128 r2 = F128::RotateLeft<2>(r0);
1129  f128 r3 = F128::RotateLeft<3>(r0);
1130 
1131  SimdMatrix m;
1132  m.r[0] = F128::MultAdd<0>(plane, light_pos, r0, each_select32);
1133  m.r[1] = F128::MultAdd<1>(plane, light_pos, r1, each_select32);
1134  m.r[2] = F128::MultAdd<2>(plane, light_pos, r2, each_select32);
1135  m.r[3] = F128::MultAdd<3>(plane, light_pos, r3, each_select32);
1136  return m;
1137 }
1138 
1139 // XMMatrixReflect
1140 NLIB_M(SimdMatrix) Matrix::Reflect(SimdPlaneArg reflection_plane) NLIB_NOEXCEPT {
1141  // SimdPlane plane = Plane::Normalize(reflection_plane);
1142  SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1143  f128 minus_2n = F128::Mult(-2.f, plane);
1144  minus_2n = F128::SetZeroToLane<3>(minus_2n);
1145 
1146  SimdMatrix m = Matrix::Identity();
1147  m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0], each_select32);
1148  m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1], each_select32);
1149  m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2], each_select32);
1150  m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3], each_select32);
1151  return m;
1152 }
1153 
1154 // different from XMMatrixDecompose
1155 // note that scale.xyz >= 0
1156 NLIB_M(void) Matrix::Decompose(SimdVector* scale, SimdMatrix* rot, SimdVector* trans, // NOLINT
1158  // translation
1159  *trans = m.r[3];
1160 
1161  // scaling
1162  f128 recp_scale;
1163  {
1164  f128 dot_x = Vector3::DotEx<true, false, false, true>(m.r[0], m.r[0]);
1165  f128 dot_y = Vector3::DotEx<false, true, false, true>(m.r[1], m.r[1]);
1166  f128 dot_z = Vector3::DotEx<false, false, true, true>(m.r[2], m.r[2]);
1167  f128 dot = F128::Or(dot_x, dot_y);
1168  dot = F128::Or(dot, dot_z);
1169  recp_scale = F128::RecpSqrt(dot);
1170  *scale = F128::Mult(dot, recp_scale);
1171  }
1172 
1173  // rotation
1174  rot->r[0] = F128::Mult<0>(recp_scale, m.r[0], each_select32);
1175  rot->r[1] = F128::Mult<1>(recp_scale, m.r[1], each_select32);
1176  rot->r[2] = F128::Mult<2>(recp_scale, m.r[2], each_select32);
1177  rot->r[3] = F128::Set0001();
1178 }
1179 
1180 #undef NLIB_M
1181 
1182 #endif // NLIB_DOXYGEN
1183 
1184 } // namespace simd
1185 NLIB_NAMESPACE_END
1186 
1187 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
The class with the collection of functions that handle 4x4 matrices.
Definition: SimdMatrix.h:28
Defines a quaternion.
float m[3][4]
A 2D 3x4 array.
Definition: SimdFloat.h:4315
f128arg SimdVectorArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4148
Definition: Base64.h:25
float m[4][3]
A 2D 4x3 array.
Definition: SimdFloat.h:4323
float m[4][4]
A 2D 4x4 array.
Definition: SimdFloat.h:4331
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:88
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
A macro for in-place matrix transposition.
Definition: SimdFloat.h:4222
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constant object.
Definition: SimdFloat.h:68
f128arg SimdQuaternionArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4150
f128arg SimdPlaneArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4152
f128 r[4]
Keeps each row of a 4x4 matrix.
Definition: SimdFloat.h:4175
The structure for keeping a 4x4 matrix.
Definition: SimdFloat.h:4159
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Config.h:105
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.
Definition: SimdInt.h:63
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
Definition: SimdFloat.h:4321
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Definition: SimdFloat.h:4306
Defines a three-dimensional vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Definition: SimdFloat.h:71
Defines a four-dimensional vector.
float m[3][3]
A 2D 3x3 array.
Definition: SimdFloat.h:4307
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
Definition: SimdFloat.h:4329
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
Definition: SimdFloat.h:4313
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
Definition: SimdFloat.h:4151
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...
Definition: SimdFloat.h:4147