nlib
SimdMatrix.h
Go to the documentation of this file.
1 
2 /*--------------------------------------------------------------------------------*
3  Project: CrossRoad
4  Copyright (C)Nintendo All rights reserved.
5 
6  These coded instructions, statements, and computer programs contain proprietary
7  information of Nintendo and/or its licensed developers and are protected by
8  national and international copyright laws. They may not be disclosed to third
9  parties or copied or duplicated in any form, in whole or in part, without the
10  prior written consent of Nintendo.
11 
12  The content herein is highly confidential and should be handled accordingly.
13  *--------------------------------------------------------------------------------*/
14 
15 #pragma once
16 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
17 #define INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
18 
19 #include "nn/nlib/simd/SimdFloat.h"
23 // #include "nn/nlib/simd/SimdGeometry.h"
24 
25 NLIB_NAMESPACE_BEGIN
26 namespace simd {
27 
// NOTE(review): the line that opens this class is not present in this listing. The
// `Matrix();` member below and the `Matrix::` definitions later in the file show that the
// class is named Matrix. Every operation declared here is a static member and the default
// constructor is private, so the class serves only as a scope for matrix functions.
29  public:
// Conversions between the plain FloatNxM structs and SimdMatrix.
30  static SimdMatrix __vectorcall LoadFloat4x4(const Float4x4* p) NLIB_NOEXCEPT;
31  static SimdMatrix __vectorcall LoadFloat3x4(const Float3x4* p) NLIB_NOEXCEPT;
32  static SimdMatrix __vectorcall LoadFloat4x3(const Float4x3* p) NLIB_NOEXCEPT;
33  static SimdMatrix __vectorcall LoadFloat3x3(const Float3x3* p) NLIB_NOEXCEPT;
34  static void __vectorcall StoreFloat4x4(Float4x4* p, SimdMatrixArg m) NLIB_NOEXCEPT;
35  static void __vectorcall StoreFloat3x4(Float3x4* p, SimdMatrixArg m) NLIB_NOEXCEPT;
36  static void __vectorcall StoreFloat4x3(Float4x3* p, SimdMatrixArg m) NLIB_NOEXCEPT;
37  static void __vectorcall StoreFloat3x3(Float3x3* p, SimdMatrixArg m) NLIB_NOEXCEPT;
38 
// Determinant, identity and inverse. Inverse() writes the determinant to *det when det is
// non-null.
39  static SimdVector __vectorcall Determinant(SimdMatrixArg m) NLIB_NOEXCEPT;
40  static SimdMatrix __vectorcall Identity() NLIB_NOEXCEPT;
41  static SimdMatrix __vectorcall Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT;
42 
// Predicates and products. MultTranspose(a, b) returns transpose(a * b).
43  static bool __vectorcall IsIdentity(SimdMatrixArg m) NLIB_NOEXCEPT;
44  static bool __vectorcall IsInfinite(SimdMatrixArg m) NLIB_NOEXCEPT;
45  static bool __vectorcall IsNaN(SimdMatrixArg m) NLIB_NOEXCEPT;
46  static SimdMatrix __vectorcall Mult(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT;
47  static SimdMatrix __vectorcall Transpose(SimdMatrixArg m) NLIB_NOEXCEPT;
48  static SimdMatrix __vectorcall MultTranspose(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT;
49 
// Scaling and translation matrices, from scalars or from a vector.
50  static SimdMatrix __vectorcall FromScaling(float scale_x, float scale_y,
51  float scale_z) NLIB_NOEXCEPT;
52  static SimdMatrix __vectorcall FromScaling(SimdVectorArg scale) NLIB_NOEXCEPT;
53  static SimdMatrix __vectorcall FromTranslation(float ofs_x, float ofs_y,
54  float ofs_z) NLIB_NOEXCEPT;
55  static SimdMatrix __vectorcall FromTranslation(SimdVectorArg ofs) NLIB_NOEXCEPT;
56 
// Rotation matrices. The angle is supplied as a precomputed sine/cosine pair rather than
// as an angle.
57  static SimdMatrix __vectorcall FromRotationX(float sin_value, float cos_value) NLIB_NOEXCEPT;
58  static SimdMatrix __vectorcall FromRotationY(float sin_value, float cos_value) NLIB_NOEXCEPT;
59  static SimdMatrix __vectorcall FromRotationZ(float sin_value, float cos_value) NLIB_NOEXCEPT;
60  static SimdMatrix __vectorcall FromRotationAxisAndSinCos(SimdVectorArg axis_normalized,
61  float sin_value,
62  float cos_value) NLIB_NOEXCEPT;
63  static SimdMatrix __vectorcall FromRotationQuaternion(SimdQuaternionArg quat) NLIB_NOEXCEPT;
// NOTE(review): the next declaration is cut off in this listing. The inline definition
// later in the file takes (SimdVectorArg sin_xyz, SimdVectorArg cos_xyz).
64  static SimdMatrix __vectorcall FromRotationZXY(SimdVectorArg sin_xyz,
66 
// View matrices; the Lh/Rh suffix presumably selects left-/right-handed coordinates.
67  static SimdMatrix __vectorcall LookToLh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
68  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
69  static SimdMatrix __vectorcall LookAtLh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
70  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
71  static SimdMatrix __vectorcall LookToRh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
72  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
73  static SimdMatrix __vectorcall LookAtRh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
74  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT;
75 
// Perspective projection matrices. The Fov variants take the sine and cosine of half the
// vertical field of view (per the parameter names).
76  static SimdMatrix __vectorcall PerspectiveLh(float width, float height, float near_z,
77  float far_z) NLIB_NOEXCEPT;
78  static SimdMatrix __vectorcall PerspectiveRh(float width, float height, float near_z,
79  float far_z) NLIB_NOEXCEPT;
80  static SimdMatrix __vectorcall PerspectiveFovLh(float half_fovy_sin, float half_fovy_cos,
81  float aspect, float near_z,
82  float far_z) NLIB_NOEXCEPT;
83  static SimdMatrix __vectorcall PerspectiveFovRh(float half_fovy_sin, float half_fovy_cos,
84  float aspect, float near_z,
85  float far_z) NLIB_NOEXCEPT;
86  static SimdMatrix __vectorcall PerspectiveOffCenterLh(float left, float right, float bottom,
87  float top, float near_z,
88  float far_z) NLIB_NOEXCEPT;
89  static SimdMatrix __vectorcall PerspectiveOffCenterRh(float left, float right, float bottom,
90  float top, float near_z,
91  float far_z) NLIB_NOEXCEPT;
92 
// Orthographic projection matrices.
93  static SimdMatrix __vectorcall OrthographicLh(float width, float height, float near_z,
94  float far_z) NLIB_NOEXCEPT;
95  static SimdMatrix __vectorcall OrthographicRh(float width, float height, float near_z,
96  float far_z) NLIB_NOEXCEPT;
97  static SimdMatrix __vectorcall OrthographicOffCenterLh(float left, float right, float bottom,
98  float top, float near_z,
99  float far_z) NLIB_NOEXCEPT;
100  static SimdMatrix __vectorcall OrthographicOffCenterRh(float left, float right, float bottom,
101  float top, float near_z,
102  float far_z) NLIB_NOEXCEPT;
103 
// Shadow / reflection matrices and decomposition into scale, rotation and translation.
104  static SimdMatrix __vectorcall Shadow(SimdPlaneArg shadow_plane,
105  SimdVector light_pos) NLIB_NOEXCEPT;
106  static SimdMatrix __vectorcall Reflect(SimdPlaneArg reflection_plane) NLIB_NOEXCEPT;
// NOTE(review): the Decompose declaration is cut off in this listing; its remaining
// parameter(s) are not visible here.
107  static void __vectorcall Decompose(SimdVector* scale, SimdMatrix* rot, SimdVector* trans,
109  // AffineTransform
110 
111  private:
// Declared private so that Matrix cannot be instantiated.
112  Matrix(); // forbidden
113 };
114 
115 #ifndef NLIB_DOXYGEN
116 
117 #define NLIB_M(tp) inline tp __vectorcall
118 
119 NLIB_M(SimdMatrix) Matrix::LoadFloat4x4(const Float4x4* p) NLIB_NOEXCEPT {
120  SimdMatrix m;
121  m.r[0] = F128::LoadA16(&p->m[0][0]);
122  m.r[1] = F128::LoadA16(&p->m[1][0]);
123  m.r[2] = F128::LoadA16(&p->m[2][0]);
124  m.r[3] = F128::LoadA16(&p->m[3][0]);
125  return m;
126 }
127 
128 // Load and convert to column major order matrix
129 NLIB_M(SimdMatrix) Matrix::LoadFloat3x4(const Float3x4* p) NLIB_NOEXCEPT {
130  SimdMatrix m;
131  m.r[0] = F128::LoadA16(&p->m[0][0]);
132  m.r[1] = F128::LoadA16(&p->m[1][0]);
133  m.r[2] = F128::LoadA16(&p->m[2][0]);
134  m.r[3] = F128::Set0001();
135  NLIB_F128_TRANSPOSE(m.r[0], m.r[1], m.r[2], m.r[3]);
136  return m;
137 }
138 
139 NLIB_M(SimdMatrix) Matrix::LoadFloat4x3(const Float4x3* p) NLIB_NOEXCEPT {
140  f128 t0 = F128::LoadA16(&p->m[0][0]);
141  f128 t1 = F128::LoadA16(&p->m[1][1]);
142  f128 t2 = F128::LoadA16(&p->m[2][2]);
143  SimdMatrix m;
144  m.r[0] = F128::SetZeroToLane<3>(t0);
145  f128 tmp1 = F128::Permute<3, 4, 5, -1>(t0, t1);
146  m.r[1] = F128::SetZeroToLane<3>(tmp1);
147  f128 tmp2 = F128::Permute<2, 3, 4, -1>(t1, t2);
148  m.r[2] = F128::SetZeroToLane<3>(tmp2);
149  m.r[3] = F128::Permute<1, 2, 3, 7>(t2, F128::SetOne());
150  return m;
151 }
152 
153 NLIB_M(SimdMatrix) Matrix::LoadFloat3x3(const Float3x3* p) NLIB_NOEXCEPT {
154  f128 t0 = F128::LoadA4(&p->m[0][0]);
155  f128 t1 = F128::LoadA4(&p->m[1][0]);
156  f128 t2 = F128::LoadA4(&p->m[1][2]);
157  f128 zero = F128::SetZero();
158  SimdMatrix m;
159  m.r[0] = F128::SetZeroToLane<3>(t0);
160  m.r[1] = F128::SetZeroToLane<3>(t1);
161  m.r[2] = F128::Permute<1, 2, 3, 7>(t2, zero);
162  m.r[3] = F128::Set0001();
163  return m;
164 }
165 
166 inline void __vectorcall Matrix::StoreFloat4x4(Float4x4* p, SimdMatrixArg m) NLIB_NOEXCEPT {
167  F128::StoreA16(&p->m[0][0], m.r[0]);
168  F128::StoreA16(&p->m[1][0], m.r[1]);
169  F128::StoreA16(&p->m[2][0], m.r[2]);
170  F128::StoreA16(&p->m[3][0], m.r[3]);
171 }
172 
173 inline void __vectorcall Matrix::StoreFloat4x3(Float4x3* p, SimdMatrixArg m) NLIB_NOEXCEPT {
174  f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
175  f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
176  f128 t2 = F128::Permute<2, 4, 5, 6>(m.r[2], m.r[3]);
177  F128::StoreA16(&p->m[0][0], t0);
178  F128::StoreA16(&p->m[1][1], t1);
179  F128::StoreA16(&p->m[2][2], t2);
180 }
181 
182 inline void __vectorcall Matrix::StoreFloat3x3(Float3x3* p, SimdMatrixArg m) NLIB_NOEXCEPT {
183  f128 t0 = F128::Permute<0, 1, 2, 4>(m.r[0], m.r[1]);
184  f128 t1 = F128::Permute<1, 2, 4, 5>(m.r[1], m.r[2]);
185  F128::StoreA4(&p->m[0][0], t0);
186  F128::StoreA4(&p->m[1][1], t1);
187  p->m[2][2] = F128::GetFloatFromLane<2>(m.r[2]);
188 }
189 
190 // XMMatrixDeterminant
191 // r = determinant(m)
// Computes the determinant by cofactor expansion along row 0 and returns the result of
// Vector4::Dot (an f128; presumably the scalar replicated across lanes -- confirm against
// Vector4::Dot).
192 NLIB_M(f128) Matrix::Determinant(SimdMatrixArg m) NLIB_NOEXCEPT {
 // Notation: row k of m is (ak, bk, ck, dk); |xy23| is the 2x2 determinant of columns
 // x, y restricted to rows 2 and 3; |xyz123| is the 3x3 determinant over rows 1..3.
193  // a0 b0 c0 d0
194  // a1 b1 c1 d1
195  // a2 b2 c2 d2
196  // a3 b3 c3 d3
197  //
198  // a0|bcd123| - b0|acd123| + c0|abd123| - d0|abc123| -> SubAdd & Dot
199 
200  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
201  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
202  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
203  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
204 
 // The three vectors below hold every 2x2 determinant of rows 2 and 3 needed above:
 //   c0det = { |cd23|, |cd23|, |bd23|, |bc23| }
 //   c1det = { |bd23|, |ad23|, |ad23|, |ac23| }
 //   c2det = { |bc23|, |ac23|, |ab23|, |ab23| }
205  f128 c0det, c1det, c2det;
206  {
207  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(m.r[2]);
208  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(m.r[3]);
209 
210  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(m.r[2]);
211  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(m.r[3]);
212 
213  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(m.r[2]);
214  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(m.r[3]);
215 
216  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
217  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
218  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
219 
 // MultSub(a, b, c) is used here as c - a * b.
220  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
221  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
222  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
223  }
224 
225  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(m.r[1]);
226  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(m.r[1]);
227  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(m.r[1]);
 // r0x = { -a0, b0, -c0, d0 }: the alternating signs are applied to row 0 with the
 // opposite polarity because det3_neg holds the NEGATED 3x3 determinants.
228  f128 r0x = F128::NegateEx<true, false, true, false>(m.r[0]);
229 
 // det3_neg = { -|bcd123|, -|acd123|, -|abd123|, -|abc123| }
230  f128 det3_neg = F128::Mult(c1det, ccbb_1);
231  det3_neg = F128::MultSub(c0det, baaa_1, det3_neg);
232  det3_neg = F128::MultSub(c2det, dddc_1, det3_neg);
233  return Vector4::Dot(r0x, det3_neg);
234 }
235 
236 // XMMatrixIdentity
237 // r[0] = { 1, 0, 0, 0 }
238 // r[1] = { 0, 1, 0, 0 }
239 // r[2] = { 0, 0, 1, 0 }
240 // r[3] = { 0, 0, 0, 1 }
241 NLIB_M(SimdMatrix) Matrix::Identity() NLIB_NOEXCEPT {
242  SimdMatrix m;
243 #if defined(NLIB_NEON) && !defined(NLIB_F128_SIMD_NOUSE)
244  float32x2_t x10 = vcreate_f32(0x000000003F800000ULL);
245  float32x2_t x01 = vcreate_f32(0x3F80000000000000ULL);
246  float32x2_t x00 = vcreate_f32(0ULL);
247  m.r[0] = vcombine_f32(x10, x00);
248  m.r[1] = vcombine_f32(x01, x00);
249  m.r[2] = vcombine_f32(x00, x10);
250  m.r[3] = vcombine_f32(x00, x01);
251 #else
252  m.r[0] = F128::LoadA16(F128::v1000_);
253  m.r[1] = F128::LoadA16(F128::v0100_);
254  m.r[2] = F128::LoadA16(F128::v0010_);
255  m.r[3] = F128::LoadA16(F128::v0001_);
256 #endif
257  return m;
258 }
259 
260 // XMMatrixTranspose
261 // use NLIB_F128_TRANSPOSE(m.r[0], m.r[1], m.r[2], m.r[3]) if you can change 'm' itself
262 NLIB_M(SimdMatrix) Matrix::Transpose(SimdMatrixArg m) NLIB_NOEXCEPT {
263  SimdMatrix ret;
264  f128 r0 = m.r[0];
265  f128 r1 = m.r[1];
266  f128 r2 = m.r[2];
267  f128 r3 = m.r[3];
268  NLIB_F128_TRANSPOSE(r0, r1, r2, r3);
269  ret.r[0] = r0;
270  ret.r[1] = r1;
271  ret.r[2] = r2;
272  ret.r[3] = r3;
273  return ret;
274 }
275 
276 inline void __vectorcall Matrix::StoreFloat3x4(Float3x4* p, SimdMatrixArg m) NLIB_NOEXCEPT {
277  SimdMatrix M = Matrix::Transpose(m);
278  F128::StoreA16(&p->m[0][0], M.r[0]);
279  F128::StoreA16(&p->m[1][0], M.r[1]);
280  F128::StoreA16(&p->m[2][0], M.r[2]);
281 }
282 
283 #if 1
284 // XMMatrixInverse
// Computes the inverse of 'm' as (matrix of signed 3x3 cofactors) * (1 / determinant).
//   det : optional output; when non-null it receives the determinant vector (mydet).
//   m   : matrix to invert.
// No check for a zero determinant is made here: the rows are multiplied by
// F128::Recp(mydet) whatever its value.
285 NLIB_M(SimdMatrix) Matrix::Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT {
286  // faster on NEON
 // The formulas below are written for M = Transpose(m). M is never materialized:
 // each Permute gathers the required lanes directly from the rows of m.
287  // SimdMatrix M = Transpose(m);
288  // M:
289  // a0 b0 c0 d0
290  // a1 b1 c1 d1
291  // a2 b2 c2 d2
292  // a3 b3 c3 d3
293 
294  // Inv:
295  // ( |bcd123|, -|acd123|, |abd123|, -|abc123|) / detm
296  // (-|bcd023|, |acd023|, -|abd023|, |abc023|) / detm
297  // ( |bcd013|, -|acd013|, |abd013|, -|abc013|) / detm
298  // (-|bcd012|, |acd012|, -|abd012|, |abc012|) / detm
299 
300  // row0:
301  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
302  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
303  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
304  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
305 
306  // row1:
307  // |bcd023| = b0|cd23| - c0|bd23| + d0|bc23| -> lane0
308  // |acd023| = a0|cd23| - c0|ad23| + d0|ac23| -> lane1
309  // |abd023| = a0|bd23| - b0|ad23| + d0|ab23| -> lane2
310  // |abc023| = a0|bc23| - b0|ac23| + c0|ab23| -> lane3
311  f128 detvalue_reciprocal;
312  SimdMatrix ret;
313  f128 mydet;
 // First half: 2x2 determinants over rows 2 and 3 of M, shared by result rows 0 and 1.
314  {
315  f128 c0det, c1det, c2det;
316  {
317  f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
318  f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
319 
320  f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
321  f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
322 
323  f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
324  f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
325 
326  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
327  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
328  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
329 
330  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
331  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
332  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
333  }
 // Result row 0, plus the determinant (dot of signed row 0 of M with the 3x3 minors).
334  {
335  f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
336  f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
337  f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
338 
 // r0x = { -m.r[0].x, m.r[1].x, -m.r[2].x, m.r[3].x }, i.e. row 0 of M with
 // alternating signs.
339  f128 r0x_m = F128::Permute<0, -1, 4, -1>(m.r[0], m.r[2]);
340  f128 r0x_p = F128::Permute<-1, 0, -1, 4>(m.r[1], m.r[3]);
341  f128 r0x = F128::Permute<0, 5, 2, 7>(F128::Negate(r0x_m), r0x_p);
342 
343  f128 det3 = F128::Mult(c1det, ccbb_1);
344  det3 = F128::MultSub(c0det, baaa_1, det3);
345  det3 = F128::MultSub(c2det, dddc_1, det3);
346 
347  mydet = Vector4::Dot(r0x, det3);
348 
349  det3 = F128::NegateEx<true, false, true, false>(det3);
350  detvalue_reciprocal = F128::Recp(mydet);
351 
352  ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
353  }
 // Result row 1.
354  {
355  f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
356  f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
357  f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
358 
359  f128 det3 = F128::Mult(c0det, baaa_0);
360  det3 = F128::MultAdd(c2det, dddc_0, det3);
361  det3 = F128::MultSub(c1det, ccbb_0, det3);
362  det3 = F128::NegateEx<true, false, true, false>(det3);
363  ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
364  }
365  }
366 
367  // row3:
368  // |bcd012| = b2|cd01| - c2|bd01| + d2|bc01| -> lane0
369  // |acd012| = a2|cd01| - c2|ad01| + d2|ac01| -> lane1
370  // |abd012| = a2|bd01| - b2|ad01| + d2|ab01| -> lane2
371  // |abc012| = a2|bc01| - b2|ac01| + c2|ab01| -> lane3
372 
373  // row2:
374  // |bcd013| = b3|cd01| - c3|bd01| + d3|bc01| -> lane0
375  // |acd013| = a3|cd01| - c3|ad01| + d3|ac01| -> lane1
376  // |abd013| = a3|bd01| - b3|ad01| + d3|ab01| -> lane2
377  // |abc013| = a3|bc01| - b3|ac01| + c3|ab01| -> lane3
 // Second half: 2x2 determinants over rows 0 and 1 of M, shared by result rows 2 and 3.
378  {
379  f128 c0det, c1det, c2det;
380  {
381  f128 ccbb_0 = F128::Permute<0, 0, 4, 4>(m.r[2], m.r[1]);
382  f128 ccbb_1 = F128::Permute<1, 1, 5, 5>(m.r[2], m.r[1]);
383 
384  f128 dddc_0 = F128::Permute<0, 0, 0, 4>(m.r[3], m.r[2]);
385  f128 dddc_1 = F128::Permute<1, 1, 1, 5>(m.r[3], m.r[2]);
386 
387  f128 baaa_0 = F128::Permute<0, 4, 4, 4>(m.r[1], m.r[0]);
388  f128 baaa_1 = F128::Permute<1, 5, 5, 5>(m.r[1], m.r[0]);
389 
390  f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
391  f128 tmp1 = F128::Mult(baaa_0, dddc_1);
392  f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
393 
394  c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
395  c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
396  c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
397  }
 // Result row 2 (uses row 3 of M).
398  {
399  f128 baaa_3 = F128::Permute<3, 7, 7, 7>(m.r[1], m.r[0]);
400  f128 ccbb_3 = F128::Permute<3, 3, 7, 7>(m.r[2], m.r[1]);
401  f128 dddc_3 = F128::Permute<3, 3, 3, 7>(m.r[3], m.r[2]);
402 
403  f128 det3 = F128::Mult(c1det, ccbb_3);
404  det3 = F128::MultSub(c0det, baaa_3, det3);
405  det3 = F128::MultSub(c2det, dddc_3, det3);
406  det3 = F128::NegateEx<true, false, true, false>(det3);
407 
408  ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
409  }
 // Result row 3 (uses row 2 of M).
410  {
411  f128 baaa_2 = F128::Permute<2, 6, 6, 6>(m.r[1], m.r[0]);
412  f128 ccbb_2 = F128::Permute<2, 2, 6, 6>(m.r[2], m.r[1]);
413  f128 dddc_2 = F128::Permute<2, 2, 2, 6>(m.r[3], m.r[2]);
414 
415  f128 det3 = F128::Mult(c0det, baaa_2);
416  det3 = F128::MultAdd(c2det, dddc_2, det3);
417  det3 = F128::MultSub(c1det, ccbb_2, det3);
418  det3 = F128::NegateEx<true, false, true, false>(det3);
419 
420  ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
421  }
422  }
 // The determinant is reported only when the caller supplied a destination.
423  if (det) {
424  *det = mydet;
425  }
426  return ret;
427 }
428 #else
429 // XMMatrixInverse
// Alternative implementation; it sits in the #else branch of an `#if 1`, so it is not
// compiled. It transposes 'm' explicitly and then uses single-operand Swizzle calls.
//   det : optional output; when non-null it receives the determinant vector.
//   m   : matrix to invert.
// No check for a zero determinant is made here.
430 NLIB_M(SimdMatrix) Matrix::Inverse(SimdVector* det, SimdMatrixArg m) NLIB_NOEXCEPT {
431  SimdMatrix M = Transpose(m);
432  // M:
433  // a0 b0 c0 d0
434  // a1 b1 c1 d1
435  // a2 b2 c2 d2
436  // a3 b3 c3 d3
437 
438  // Inv:
439  // ( |bcd123|, -|acd123|, |abd123|, -|abc123|) / detm
440  // (-|bcd023|, |acd023|, -|abd023|, |abc023|) / detm
441  // ( |bcd013|, -|acd013|, |abd013|, -|abc013|) / detm
442  // (-|bcd012|, |acd012|, -|abd012|, |abc012|) / detm
443 
444  // row0:
445  // |bcd123| = b1|cd23| - c1|bd23| + d1|bc23| -> lane0
446  // |acd123| = a1|cd23| - c1|ad23| + d1|ac23| -> lane1
447  // |abd123| = a1|bd23| - b1|ad23| + d1|ab23| -> lane2
448  // |abc123| = a1|bc23| - b1|ac23| + c1|ab23| -> lane3
449 
450  // row1:
451  // |bcd023| = b0|cd23| - c0|bd23| + d0|bc23| -> lane0
452  // |acd023| = a0|cd23| - c0|ad23| + d0|ac23| -> lane1
453  // |abd023| = a0|bd23| - b0|ad23| + d0|ab23| -> lane2
454  // |abc023| = a0|bc23| - b0|ac23| + c0|ab23| -> lane3
455  f128 detvalue_reciprocal;
456  SimdMatrix ret;
 // First half: 2x2 determinants over rows 2 and 3 of M, shared by result rows 0 and 1.
457  {
458  f128 c0det, c1det, c2det;
459  {
460  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
461  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
462 
463  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
464  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
465 
466  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
467  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
468 
469  f128 tmp0 = F128::Mult(ccbb_2, dddc_3);
470  f128 tmp1 = F128::Mult(baaa_2, dddc_3);
471  f128 tmp2 = F128::Mult(baaa_2, ccbb_3);
472 
473  c0det = F128::MultSub(dddc_2, ccbb_3, tmp0);
474  c1det = F128::MultSub(dddc_2, baaa_3, tmp1);
475  c2det = F128::MultSub(ccbb_2, baaa_3, tmp2);
476  }
 // Result row 0 and the determinant.
477  {
478  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
479  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
480  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
481  f128 r0x = F128::NegateEx<true, false, true, false>(M.r[0]);
482 
483  f128 det3 = F128::Mult(c1det, ccbb_1);
484  det3 = F128::MultSub(c0det, baaa_1, det3);
485  det3 = F128::MultSub(c2det, dddc_1, det3);
486 
 // detvalue_reciprocal temporarily holds the determinant itself; it is replaced
 // by its reciprocal a few lines below, after *det has been written.
487  detvalue_reciprocal = Vector4::Dot(r0x, det3);
488  if (det) {
489  *det = detvalue_reciprocal;
490  }
491 
492  det3 = F128::NegateEx<true, false, true, false>(det3);
493  detvalue_reciprocal = F128::Recp(detvalue_reciprocal);
494 
495  ret.r[0] = F128::Mult(detvalue_reciprocal, det3);
496  }
 // Result row 1.
497  {
498  f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
499  f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
500  f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
501 
502  f128 det3 = F128::Mult(c0det, baaa_0);
503  det3 = F128::MultAdd(c2det, dddc_0, det3);
504  det3 = F128::MultSub(c1det, ccbb_0, det3);
505  det3 = F128::NegateEx<true, false, true, false>(det3);
506  ret.r[1] = F128::Mult(detvalue_reciprocal, det3);
507  }
508  }
509 
510  // row3:
511  // |bcd012| = b2|cd01| - c2|bd01| + d2|bc01| -> lane0
512  // |acd012| = a2|cd01| - c2|ad01| + d2|ac01| -> lane1
513  // |abd012| = a2|bd01| - b2|ad01| + d2|ab01| -> lane2
514  // |abc012| = a2|bc01| - b2|ac01| + c2|ab01| -> lane3
515 
516  // row2:
517  // |bcd013| = b3|cd01| - c3|bd01| + d3|bc01| -> lane0
518  // |acd013| = a3|cd01| - c3|ad01| + d3|ac01| -> lane1
519  // |abd013| = a3|bd01| - b3|ad01| + d3|ab01| -> lane2
520  // |abc013| = a3|bc01| - b3|ac01| + c3|ab01| -> lane3
 // Second half: 2x2 determinants over rows 0 and 1 of M, shared by result rows 2 and 3.
521  {
522  f128 c0det, c1det, c2det;
523  {
524  f128 ccbb_0 = F128::Swizzle<2, 2, 1, 1>(M.r[0]);
525  f128 ccbb_1 = F128::Swizzle<2, 2, 1, 1>(M.r[1]);
526 
527  f128 dddc_0 = F128::Swizzle<3, 3, 3, 2>(M.r[0]);
528  f128 dddc_1 = F128::Swizzle<3, 3, 3, 2>(M.r[1]);
529 
530  f128 baaa_0 = F128::Swizzle<1, 0, 0, 0>(M.r[0]);
531  f128 baaa_1 = F128::Swizzle<1, 0, 0, 0>(M.r[1]);
532 
533  f128 tmp0 = F128::Mult(ccbb_0, dddc_1);
534  f128 tmp1 = F128::Mult(baaa_0, dddc_1);
535  f128 tmp2 = F128::Mult(baaa_0, ccbb_1);
536 
537  c0det = F128::MultSub(dddc_0, ccbb_1, tmp0);
538  c1det = F128::MultSub(dddc_0, baaa_1, tmp1);
539  c2det = F128::MultSub(ccbb_0, baaa_1, tmp2);
540  }
 // Result row 2 (uses row 3 of M).
541  {
542  f128 baaa_3 = F128::Swizzle<1, 0, 0, 0>(M.r[3]);
543  f128 ccbb_3 = F128::Swizzle<2, 2, 1, 1>(M.r[3]);
544  f128 dddc_3 = F128::Swizzle<3, 3, 3, 2>(M.r[3]);
545 
546  f128 det3 = F128::Mult(c1det, ccbb_3);
547  det3 = F128::MultSub(c0det, baaa_3, det3);
548  det3 = F128::MultSub(c2det, dddc_3, det3);
549  det3 = F128::NegateEx<true, false, true, false>(det3);
550 
551  ret.r[2] = F128::Mult(detvalue_reciprocal, det3);
552  }
 // Result row 3 (uses row 2 of M).
553  {
554  f128 baaa_2 = F128::Swizzle<1, 0, 0, 0>(M.r[2]);
555  f128 ccbb_2 = F128::Swizzle<2, 2, 1, 1>(M.r[2]);
556  f128 dddc_2 = F128::Swizzle<3, 3, 3, 2>(M.r[2]);
557 
558  f128 det3 = F128::Mult(c0det, baaa_2);
559  det3 = F128::MultAdd(c2det, dddc_2, det3);
560  det3 = F128::MultSub(c1det, ccbb_2, det3);
561  det3 = F128::NegateEx<true, false, true, false>(det3);
562 
563  ret.r[3] = F128::Mult(detvalue_reciprocal, det3);
564  }
565  }
566  return ret;
567 }
568 #endif
569 
570 // XMMatrixIsIdentity
571 // true if m is an indentity matrix
572 NLIB_M(bool) Matrix::IsIdentity(SimdMatrixArg m) NLIB_NOEXCEPT {
573  SimdMatrix x = Matrix::Identity();
574  f128 cmp0 = F128::CmpEq(x.r[0], m.r[0]);
575  f128 cmp1 = F128::CmpEq(x.r[1], m.r[1]);
576  f128 cmp2 = F128::CmpEq(x.r[2], m.r[2]);
577  f128 cmp3 = F128::CmpEq(x.r[3], m.r[3]);
578  cmp0 = F128::And(cmp0, cmp1);
579  cmp2 = F128::And(cmp2, cmp3);
580  cmp0 = F128::And(cmp0, cmp2);
581  return F128::IsAllMaskTrue(cmp0);
582 }
583 
584 // XMMatrixIsInfinite
585 // true if there is (i, j) which satisfies isinf(m[i][j])
586 NLIB_M(bool) Matrix::IsInfinite(SimdMatrixArg m) NLIB_NOEXCEPT {
587 #ifdef NLIB_F128_SIMD_NOUSE
588  f128 cmp0 = F128::IsInfinite(m.r[0]);
589  f128 cmp1 = F128::IsInfinite(m.r[1]);
590  f128 cmp2 = F128::IsInfinite(m.r[2]);
591  f128 cmp3 = F128::IsInfinite(m.r[3]);
592  cmp0 = F128::Or(cmp0, cmp1);
593  cmp2 = F128::Or(cmp2, cmp3);
594  cmp0 = F128::Or(cmp0, cmp2);
595  return !F128::IsAllMaskFalse(cmp0);
596 #else
597  f128 inf_value = F128::SetInfinity();
598  f128 cmp0 = F128::CmpEq(inf_value, F128::Abs(m.r[0]));
599  f128 cmp1 = F128::CmpEq(inf_value, F128::Abs(m.r[1]));
600  f128 cmp2 = F128::CmpEq(inf_value, F128::Abs(m.r[2]));
601  f128 cmp3 = F128::CmpEq(inf_value, F128::Abs(m.r[3]));
602  cmp0 = F128::Or(cmp0, cmp1);
603  cmp2 = F128::Or(cmp2, cmp3);
604  cmp0 = F128::Or(cmp0, cmp2);
605  return !F128::IsAllMaskFalse(cmp0);
606 #endif
607 }
608 
609 // XMMatrixIsNaN
610 // true if there is (i, j) which satisfies isnan(m[i][j])
611 NLIB_M(bool) Matrix::IsNaN(SimdMatrixArg m) NLIB_NOEXCEPT {
612  f128 cmp0 = F128::IsNaN(m.r[0]);
613  f128 cmp1 = F128::IsNaN(m.r[1]);
614  f128 cmp2 = F128::IsNaN(m.r[2]);
615  f128 cmp3 = F128::IsNaN(m.r[3]);
616  cmp0 = F128::Or(cmp0, cmp1);
617  cmp2 = F128::Or(cmp2, cmp3);
618  cmp0 = F128::Or(cmp0, cmp2);
619  return !F128::IsAllMaskFalse(cmp0);
620 }
621 
622 // XMMatrixMultiply
623 // r = a * b
624 NLIB_M(SimdMatrix) Matrix::Mult(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
625  SimdMatrix m;
626  m.r[0] = Vector4::Transform(a.r[0], b);
627  m.r[1] = Vector4::Transform(a.r[1], b);
628  m.r[2] = Vector4::Transform(a.r[2], b);
629  m.r[3] = Vector4::Transform(a.r[3], b);
630  return m;
631 }
632 
633 // XMMatrixMultiplyTranspose
634 // r = transpose(a * b)
635 NLIB_M(SimdMatrix) Matrix::MultTranspose(SimdMatrixArg a, SimdMatrixArg b) NLIB_NOEXCEPT {
636  f128 r0 = Vector4::Transform(a.r[0], b);
637  f128 r1 = Vector4::Transform(a.r[1], b);
638  f128 r2 = Vector4::Transform(a.r[2], b);
639  f128 r3 = Vector4::Transform(a.r[3], b);
640  SimdMatrix ret;
641  NLIB_F128_TRANSPOSE(r0, r1, r2, r3);
642  ret.r[0] = r0;
643  ret.r[1] = r1;
644  ret.r[2] = r2;
645  ret.r[3] = r3;
646  return ret;
647 }
648 
649 // XMMatrixScaling
650 // r[0] = { x, 0, 0, 0 }
651 // r[1] = { 0, y, 0, 0 }
652 // r[2] = { 0, 0, z, 0 }
653 // r[3] = { 0, 0, 0, 1 }
654 NLIB_M(SimdMatrix) Matrix::FromScaling(float scale_x, float scale_y, float scale_z) NLIB_NOEXCEPT {
655  SimdMatrix m;
656  f128 zero = F128::SetZero();
657  m.r[0] = F128::SetFloatToLane<0>(zero, scale_x);
658  m.r[1] = F128::SetFloatToLane<1>(zero, scale_y);
659  m.r[2] = F128::SetFloatToLane<2>(zero, scale_z);
660  m.r[3] = F128::Set0001();
661  return m;
662 }
663 
664 // XMMatrixScalingFromVector
665 // r[0] = { x, 0, 0, 0 }
666 // r[1] = { 0, y, 0, 0 }
667 // r[2] = { 0, 0, z, 0 }
668 // r[3] = { 0, 0, 0, 1 }
669 NLIB_M(SimdMatrix) Matrix::FromScaling(SimdVectorArg scale) NLIB_NOEXCEPT {
670  SimdMatrix m;
671  f128 zero = F128::SetZero();
672  m.r[0] = F128::Splat<false, true, true, true>(scale, zero);
673  m.r[1] = F128::Splat<true, false, true, true>(scale, zero);
674  m.r[2] = F128::Splat<true, true, false, true>(scale, zero);
675  m.r[3] = F128::Set0001();
676  return m;
677 }
678 
679 // XMMatrixTranslation
680 // r[0] = { 1, 0, 0, 0 }
681 // r[1] = { 0, 1, 0, 0 }
682 // r[2] = { 0, 0, 1, 0 }
683 // r[3] = { x, y, z, 1 }
684 NLIB_M(SimdMatrix) Matrix::FromTranslation(float ofs_x, float ofs_y, float ofs_z) NLIB_NOEXCEPT {
685  SimdMatrix m;
686  m.r[0] = F128::Set1000();
687  m.r[1] = F128::Set0100();
688  m.r[2] = F128::Set0010();
689  m.r[3] = F128::SetValue(ofs_x, ofs_y, ofs_z, 1.f);
690  return m;
691 }
692 
693 // XMMatrixTranslationFromVector
694 // r[0] = { 1, 0, 0, 0 }
695 // r[1] = { 0, 1, 0, 0 }
696 // r[2] = { 0, 0, 1, 0 }
697 // r[3] = { x, y, z, 1 }
698 NLIB_M(SimdMatrix) Matrix::FromTranslation(SimdVectorArg ofs) NLIB_NOEXCEPT {
699  SimdMatrix m;
700  m.r[0] = F128::Set1000();
701  m.r[1] = F128::Set0100();
702  m.r[2] = F128::Set0010();
703  m.r[3] = F128::Permute<0, 1, 2, 4>(ofs, m.r[0]);
704  return m;
705 }
706 
707 // XMMatrixRotationX
708 // r[0] = { 1 0 0 0 }
709 // r[1] = { 0 c s 0 }
710 // r[2] = { 0 -s c 0 }
711 // r[3] = { 0 0 0 1 }
712 NLIB_M(SimdMatrix) Matrix::FromRotationX(float sin_value, float cos_value) NLIB_NOEXCEPT {
713  SimdMatrix m;
714  SimdVector zero = F128::SetZero();
715  f128 r1 = F128::SetFloatToLane<1>(zero, cos_value);
716  r1 = F128::SetFloatToLane<2>(r1, sin_value);
717  f128 r2 = F128::SetFloatToLane<1>(zero, -sin_value);
718  r2 = F128::SetFloatToLane<2>(r2, cos_value);
719 
720  m.r[0] = F128::Set1000();
721  m.r[1] = r1;
722  m.r[2] = r2;
723  m.r[3] = F128::Set0001();
724  return m;
725 }
726 
727 // XMMatrixRotationY
728 // r[0] = { c 0 -s 0 }
729 // r[1] = { 0 1 0 0 }
730 // r[2] = { s 0 c 0 }
731 // r[3] = { 0 0 0 1 }
732 NLIB_M(SimdMatrix) Matrix::FromRotationY(float sin_value, float cos_value) NLIB_NOEXCEPT {
733  SimdMatrix m;
734  SimdVector zero = F128::SetZero();
735  f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
736  r0 = F128::SetFloatToLane<2>(r0, -sin_value);
737  f128 r2 = F128::SetFloatToLane<0>(zero, sin_value);
738  r2 = F128::SetFloatToLane<2>(r2, cos_value);
739 
740  m.r[0] = r0;
741  m.r[1] = F128::Set0100();
742  m.r[2] = r2;
743  m.r[3] = F128::Set0001();
744  return m;
745 }
746 
747 // XMMatrixRotationZ
748 // r[0] = { c s 0 0 }
749 // r[1] = { -s c 0 0 }
750 // r[2] = { 0 0 1 0 }
751 // r[3] = { 0 0 0 1 }
752 NLIB_M(SimdMatrix) Matrix::FromRotationZ(float sin_value, float cos_value) NLIB_NOEXCEPT {
753  SimdMatrix m;
754  SimdVector zero = F128::SetZero();
755  f128 r0 = F128::SetFloatToLane<0>(zero, cos_value);
756  r0 = F128::SetFloatToLane<1>(r0, sin_value);
757  f128 r1 = F128::SetFloatToLane<0>(zero, -sin_value);
758  r1 = F128::SetFloatToLane<1>(r1, cos_value);
759 
760  m.r[0] = r0;
761  m.r[1] = r1;
762  m.r[2] = F128::Set0010();
763  m.r[3] = F128::Set0001();
764  return m;
765 }
766 
767 // XMMatrixRotationAxis
768 // The result may be different from DirectXMath's XMMatrixRotationAxis.
769 // It is because the calculation order is different.
770 NLIB_M(SimdMatrix)
771 Matrix::FromRotationAxisAndSinCos(SimdVectorArg axis_normalized, float sin_value,
772  float cos_value) NLIB_NOEXCEPT {
773  // m00, m11, m22, *
774  f128 diagonal, c1;
775  {
776  f128 nn = F128::Mult(axis_normalized, axis_normalized);
777  f128 c = F128::SetValue(cos_value, each_float);
778  c1 = F128::SetValue(1.f - cos_value, each_float);
779  diagonal = F128::MultAdd(c1, nn, c);
780  diagonal = F128::SetZeroToLane<3>(diagonal);
781  }
782 
783  f128 zxy = F128::Swizzle<2, 0, 1, 2>(axis_normalized);
784  f128 s = F128::SetValue(sin_value, each_float);
785  f128 xy_yz_xz = F128::Mult(axis_normalized, F128::Swizzle<1, 2, 0, 3>(axis_normalized));
786  xy_yz_xz = F128::Mult(c1, xy_yz_xz);
787  f128 plus = F128::MultAdd(s, zxy, xy_yz_xz); // xy(1-c)+sz, yz(1-c)+sx, xz(1-c)+sy
788  f128 minus = F128::MultSub(s, zxy, xy_yz_xz); // xy(1-c)-sz, yz(1-c)-sx, xz(1-c)-sy
789 
790  f128 t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
791  f128 t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
792 
793  SimdMatrix m;
794  m.r[0] = F128::Permute<4, 0, 1, 7>(t1, diagonal);
795  m.r[1] = F128::Permute<2, 5, 0, 7>(t0, diagonal);
796  m.r[2] = F128::Permute<1, 3, 6, 7>(t0, diagonal);
797  m.r[3] = F128::Set0001();
798  return m;
799 }
800 
801 // XMMatrixRotationQuaternion
802 // the result may differ from directxmath's one because the computation sequence is different.
803 NLIB_M(SimdMatrix) Matrix::FromRotationQuaternion(SimdQuaternionArg quat) NLIB_NOEXCEPT {
804  // m00, m11, m22, *
805  f128 q2 = F128::Add(quat, quat);
806  f128 qq2 = F128::Mult(quat, q2);
807  f128 t0, t1;
808 
809  t0 = F128::Swizzle<1, 0, 0, -1>(qq2); // 2y^2, 2x^2, 2x^2, *
810  t1 = F128::Swizzle<2, 2, 1, -1>(qq2); // 2z^2, 2z^2, 2y^2, *
811  // 1-2y^2-2z^2, 1-2x^2-2z^2, 1-2x^2-2y^2, 0
812  f128 diagonal = F128::Sub(F128::Sub(F128::SetOne(), t0), t1);
813  diagonal = F128::SetFloatToLane<3>(diagonal, 0.f);
814 
815  t0 = F128::Swizzle<1, 0, 0, -1>(quat); // y, x, x, *
816  t1 = F128::Swizzle<2, 2, 1, -1>(q2); // 2z, 2z, 2y, *
817  f128 yz_xz_xy = F128::Mult(t0, t1); // 2yz, 2xz, 2xy, *
818 
819  t0 = F128::SetValue<3>(quat, each_select32); // w, w, w, *
820  f128 wx_wy_wz = F128::Mult(q2, t0); // 2wx, 2wy, 2wz
821 
822  f128 plus = F128::Add(yz_xz_xy, wx_wy_wz); // 2yz+2wx, 2xz+2wy, 2xy+2wz, *
823  f128 minus = F128::Sub(yz_xz_xy, wx_wy_wz); // 2yz-2wx, 2xz-2wy, 2xy-2wz, *
824 
825  t0 = F128::Permute<1, 2, 4, 5>(plus, minus);
826  t1 = F128::Permute<0, 6, 2, 3>(plus, minus);
827 
828  SimdMatrix m;
829  m.r[0] = F128::Permute<4, 1, 3, 7>(t0, diagonal);
830  m.r[1] = F128::Permute<1, 5, 0, 7>(t1, diagonal);
831  m.r[2] = F128::Permute<0, 2, 6, 7>(t0, diagonal);
832  m.r[3] = F128::Set0001();
833  return m;
834 }
835 
836 // XMMatrixRotationRollPitchYaw
837 NLIB_M(SimdMatrix)
838 Matrix::FromRotationZXY(SimdVectorArg sin_xyz, SimdVectorArg cos_xyz) NLIB_NOEXCEPT {
839  // CzCy+SzSxSy SzCx -CzSy+SzSxCy 0
840  // -SzCy+CzSxSy CzCx SzSy+CzSxCy 0
841  // CxSy -Sx CxCy 0
842  f128 m00_12_02_10;
843  {
844  f128 sz_cz_sz_cz = F128::Permute<2, 6, 2, 6>(sin_xyz, cos_xyz);
845  f128 sy_cy_cy_sy = F128::Permute<1, 5, 5, 1>(sin_xyz, cos_xyz);
846  f128 tmp = F128::Mult(sz_cz_sz_cz, sy_cy_cy_sy);
847  m00_12_02_10 = F128::Mult<0>(sin_xyz, tmp, each_select32);
848  tmp = F128::Swizzle<1, 0, 3, 2>(tmp);
849  tmp = F128::NegateEx<false, false, true, true>(tmp);
850  m00_12_02_10 = F128::Add(tmp, m00_12_02_10);
851  }
852  f128 m20_01_22_11;
853  {
854  f128 sy_sz_cy_cz = F128::Permute<1, 2, 5, 6>(sin_xyz, cos_xyz);
855  m20_01_22_11 = F128::Mult<0>(cos_xyz, sy_sz_cy_cz, each_select32);
856  }
857 
858  f128 r2 = F128::SetFloatToLane<3>(m20_01_22_11, 0.f);
859  f128 r1 = F128::Permute<3, 7, 1, 1>(m00_12_02_10, m20_01_22_11);
860 
861  SimdMatrix m;
862  m.r[0] = F128::Permute<0, 5, 2, 7>(m00_12_02_10, r2);
863  m.r[1] = F128::SetZeroToLane<3>(r1);
864  m.r[2] = F128::SetFloatToLane<1>(r2, -F128::GetFloatFromLane<0>(sin_xyz));
865  m.r[3] = F128::Set0001();
866  return m;
867 }
868 
869 // XMMatrixLookToLH
870 NLIB_M(SimdMatrix)
871 Matrix::LookToLh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
872  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
873  SimdVector r0 = Vector3::Cross(up_dir_normalized, eye_dir_normalized);
874  SimdVector r1 = Vector3::Cross(eye_dir_normalized, r0);
875  SimdVector neg = F128::Negate(eye_pos);
876 #ifdef NLIB_NEON
877  neg = F128::SetZeroToLane<3>(neg);
878  f128 d012 = Vector4::Dot3(neg, r0, r1, eye_dir_normalized);
879  SimdMatrix m;
880  m.r[0] = r0;
881  m.r[1] = r1;
882  m.r[2] = eye_dir_normalized;
883  m.r[3] = F128::Set0001();
884  m = Transpose(m);
885  m.r[3] = F128::SetFloatToLane<3>(d012, 1.f);
886  return m;
887 #else
888  f128 d0 = Vector3::Dot(r0, neg);
889  f128 d1 = Vector3::Dot(r1, neg);
890  f128 d2 = Vector3::Dot(eye_dir_normalized, neg);
891  SimdMatrix m;
892  m.r[0] = F128::Splat<false, false, false, true>(r0, d0);
893  m.r[1] = F128::Splat<false, false, false, true>(r1, d1);
894  m.r[2] = F128::Splat<false, false, false, true>(eye_dir_normalized, d2);
895  m.r[3] = F128::Set0001();
896  return Transpose(m);
897 #endif
898 }
899 
900 // XMMatrixLookAtLH
901 NLIB_M(SimdMatrix)
902 Matrix::LookAtLh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
903  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
904  SimdVector eye_dir = F128::Sub(at_pos, eye_pos);
905  eye_dir = Vector3::Normalize(eye_dir);
906  return LookToLh(eye_pos, eye_dir, up_dir_normalized);
907 }
908 
909 // XMMatrixLookToRH
910 NLIB_M(SimdMatrix)
911 Matrix::LookToRh(SimdVectorArg eye_pos, SimdVectorArg eye_dir_normalized,
912  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
913  return LookToLh(eye_pos, F128::Negate(eye_dir_normalized), up_dir_normalized);
914 }
915 
916 // XMMatrixLookAtRH
917 NLIB_M(SimdMatrix)
918 Matrix::LookAtRh(SimdVectorArg eye_pos, SimdVectorArg at_pos,
919  SimdVectorArg up_dir_normalized) NLIB_NOEXCEPT {
920  SimdVector eye_dir = F128::Sub(eye_pos, at_pos);
921  eye_dir = Vector3::Normalize(eye_dir);
922  return LookToLh(eye_pos, eye_dir, up_dir_normalized);
923 }
924 
925 // XMMatrixPerspectiveLH
926 NLIB_M(SimdMatrix)
927 Matrix::PerspectiveLh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT {
928  float near2 = near_z + near_z;
929  float range = far_z / (far_z - near_z);
930  f128 zero = F128::SetZero();
931  f128 v = F128::SetValue(near2 / width, near2 / height, range, -range * near_z);
932  SimdMatrix m;
933  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
934  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
935  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
936  m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
937  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
938  return m;
939 }
940 
941 // XMMatrixPerspectiveRH
942 NLIB_M(SimdMatrix)
943 Matrix::PerspectiveRh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT {
944  float near2 = near_z + near_z;
945  float range = far_z / (near_z - far_z);
946  f128 zero = F128::SetZero();
947  f128 v = F128::SetValue(near2 / width, near2 / height, range, range * near_z);
948  SimdMatrix m;
949  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
950  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
951  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
952  m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
953  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
954  return m;
955 }
956 
957 // XMMatrixPerspectiveFovLH
958 NLIB_M(SimdMatrix)
959 Matrix::PerspectiveFovLh(float half_fovy_sin, float half_fovy_cos, float aspect, float near_z,
960  float far_z) NLIB_NOEXCEPT {
961  float height = half_fovy_cos / half_fovy_sin;
962  float width = height / aspect;
963  float range = far_z / (far_z - near_z);
964 
965  f128 zero = F128::SetZero();
966  f128 v = F128::SetValue(width, height, range, -range * near_z);
967  SimdMatrix m;
968  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
969  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
970  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
971  m.r[2] = F128::SetFloatToLane<3>(tmp, 1.f);
972  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
973  return m;
974 }
975 
976 // XMMatrixPerspectiveFovRH
977 NLIB_M(SimdMatrix)
978 Matrix::PerspectiveFovRh(float half_fovy_sin, float half_fovy_cos, float aspect, float near_z,
979  float far_z) NLIB_NOEXCEPT {
980  float height = half_fovy_cos / half_fovy_sin;
981  float width = height / aspect;
982  float range = far_z / (near_z - far_z);
983 
984  f128 zero = F128::SetZero();
985  f128 v = F128::SetValue(width, height, range, range * near_z);
986  SimdMatrix m;
987  m.r[0] = F128::Splat<false, true, true, true>(v, zero);
988  m.r[1] = F128::Splat<true, false, true, true>(v, zero);
989  f128 tmp = F128::Splat<true, true, false, true>(v, zero);
990  m.r[2] = F128::SetFloatToLane<3>(tmp, -1.f);
991  m.r[3] = F128::Permute<0, 1, 7, 0>(zero, v);
992  return m;
993 }
994 
995 // XMMatrixPerspectiveOffCenterLH
996 NLIB_M(SimdMatrix)
997 Matrix::PerspectiveOffCenterLh(float left, float right, float bottom, float top, float near_z,
998  float far_z) NLIB_NOEXCEPT {
999  float near2 = near_z + near_z;
1000  f128 div;
1001  {
1002  f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1003  f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1004  div = F128::Div(a, b);
1005  // recpWidth, recpHeight, range, 1.f
1006  }
1007  f128 zero = F128::SetZero();
1008  f128 v0 = F128::SetValue(near2, near2, -near_z, 1.f);
1009  f128 r2 = F128::SetValue(-(left + right), -(top + bottom), 1.f, 1.f);
1010  v0 = F128::Mult(v0, div);
1011 
1012  SimdMatrix m;
1013  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1014  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1015  m.r[2] = F128::Mult(r2, div);
1016  m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
1017  return m;
1018 }
1019 
1020 // XMMatrixPerspectiveOffCenterRH
1021 NLIB_M(SimdMatrix)
1022 Matrix::PerspectiveOffCenterRh(float left, float right, float bottom, float top, float near_z,
1023  float far_z) NLIB_NOEXCEPT {
1024  float near2 = near_z + near_z;
1025  f128 div;
1026  {
1027  f128 a = F128::SetValue(1.f, 1.f, far_z, 1.f);
1028  f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, -1.f);
1029  div = F128::Div(a, b);
1030  // recpWidth, recpHeight, range, 1.f
1031  }
1032  f128 zero = F128::SetZero();
1033  f128 v0 = F128::SetValue(near2, near2, near_z, 1.f);
1034  f128 r2 = F128::SetValue((left + right), (top + bottom), 1.f, 1.f);
1035  v0 = F128::Mult(v0, div);
1036 
1037  SimdMatrix m;
1038  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1039  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1040  m.r[2] = F128::Mult(r2, div);
1041  m.r[3] = F128::Splat<true, true, false, true>(v0, zero);
1042  return m;
1043 }
1044 
1045 // XMMatrixOrthographicLH
1046 NLIB_M(SimdMatrix)
1047 Matrix::OrthographicLh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT {
1048  f128 div;
1049  {
1050  f128 a = F128::SetValue(2.f, 2.f, 1.f, -near_z);
1051  f128 b = F128::SetValue(width, height, far_z - near_z, far_z - near_z);
1052  div = F128::Div(a, b);
1053  }
1054  f128 zero = F128::SetZero();
1055 
1056  SimdMatrix m;
1057  m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1058  m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1059  m.r[2] = F128::Splat<true, true, false, true>(div, zero);
1060  f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1061  m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1062  return m;
1063 }
1064 
1065 // XMMatrixOrthographicRH
1066 NLIB_M(SimdMatrix)
1067 Matrix::OrthographicRh(float width, float height, float near_z, float far_z) NLIB_NOEXCEPT {
1068  f128 div;
1069  {
1070  f128 a = F128::SetValue(2.f, 2.f, 1.f, near_z);
1071  f128 b = F128::SetValue(width, height, near_z - far_z, near_z - far_z);
1072  div = F128::Div(a, b);
1073  }
1074  f128 zero = F128::SetZero();
1075 
1076  SimdMatrix m;
1077  m.r[0] = F128::Splat<false, true, true, true>(div, zero);
1078  m.r[1] = F128::Splat<true, false, true, true>(div, zero);
1079  m.r[2] = F128::Splat<true, true, false, true>(div, zero);
1080  f128 tmp = F128::Permute<0, 1, 7, 1>(zero, div);
1081  m.r[3] = F128::SetFloatToLane<3>(tmp, 1.f);
1082  return m;
1083 }
1084 
1085 // XMMatrixOrthographicOffCenterLH
1086 NLIB_M(SimdMatrix)
1087 Matrix::OrthographicOffCenterLh(float left, float right, float bottom, float top, float near_z,
1088  float far_z) NLIB_NOEXCEPT {
1089  f128 div;
1090  {
1091  f128 a = F128::SetOne();
1092  f128 b = F128::SetValue(right - left, top - bottom, far_z - near_z, 1.f);
1093  div = F128::Div(a, b);
1094  }
1095  f128 zero = F128::SetZero();
1096  f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1097  f128 r3 = F128::SetValue(-(left + right), -(top + bottom), -near_z, 1.f);
1098  v0 = F128::Mult(v0, div);
1099 
1100  SimdMatrix m;
1101  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1102  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1103  m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1104  m.r[3] = F128::Mult(r3, div);
1105  return m;
1106 }
1107 
1108 // XMMatrixOrthographicOffCenterRH
1109 NLIB_M(SimdMatrix)
1110 Matrix::OrthographicOffCenterRh(float left, float right, float bottom, float top, float near_z,
1111  float far_z) NLIB_NOEXCEPT {
1112  f128 div;
1113  {
1114  f128 a = F128::SetOne();
1115  f128 b = F128::SetValue(right - left, top - bottom, near_z - far_z, 1.f);
1116  div = F128::Div(a, b);
1117  }
1118  f128 zero = F128::SetZero();
1119  f128 v0 = F128::SetValue(2.f, 2.f, 1.f, 1.f);
1120  f128 r3 = F128::SetValue(-(left + right), -(top + bottom), near_z, 1.f);
1121  v0 = F128::Mult(v0, div);
1122 
1123  SimdMatrix m;
1124  m.r[0] = F128::Splat<false, true, true, true>(v0, zero);
1125  m.r[1] = F128::Splat<true, false, true, true>(v0, zero);
1126  m.r[2] = F128::Splat<true, true, false, true>(v0, zero);
1127  m.r[3] = F128::Mult(r3, div);
1128  return m;
1129 }
1130 
1131 // XMMatrixShadow
1132 NLIB_M(SimdMatrix) Matrix::Shadow(SimdPlaneArg shadow_plane, SimdVector light_pos) NLIB_NOEXCEPT {
1133  // Plane::Normalize(shadow_plane);
1134  SimdPlane plane = F128::Mult(Vector3::RecpLength(shadow_plane), shadow_plane);
1135  // distance from plane to the light
1136  f128 r0 = Vector4::DotEx<true, false, false, false>(plane, light_pos);
1137  plane = F128::Negate(plane);
1138  f128 r1 = F128::RotateLeft<1>(r0);
1139  f128 r2 = F128::RotateLeft<2>(r0);
1140  f128 r3 = F128::RotateLeft<3>(r0);
1141 
1142  SimdMatrix m;
1143  m.r[0] = F128::MultAdd<0>(plane, light_pos, r0, each_select32);
1144  m.r[1] = F128::MultAdd<1>(plane, light_pos, r1, each_select32);
1145  m.r[2] = F128::MultAdd<2>(plane, light_pos, r2, each_select32);
1146  m.r[3] = F128::MultAdd<3>(plane, light_pos, r3, each_select32);
1147  return m;
1148 }
1149 
1150 // XMMatrixReflect
1151 NLIB_M(SimdMatrix) Matrix::Reflect(SimdPlaneArg reflection_plane) NLIB_NOEXCEPT {
1152  // SimdPlane plane = Plane::Normalize(reflection_plane);
1153  SimdPlane plane = F128::Mult(Vector3::RecpLength(reflection_plane), reflection_plane);
1154  f128 minus_2n = F128::Mult(-2.f, plane);
1155  minus_2n = F128::SetZeroToLane<3>(minus_2n);
1156 
1157  SimdMatrix m = Matrix::Identity();
1158  m.r[0] = F128::MultAdd<0>(plane, minus_2n, m.r[0], each_select32);
1159  m.r[1] = F128::MultAdd<1>(plane, minus_2n, m.r[1], each_select32);
1160  m.r[2] = F128::MultAdd<2>(plane, minus_2n, m.r[2], each_select32);
1161  m.r[3] = F128::MultAdd<3>(plane, minus_2n, m.r[3], each_select32);
1162  return m;
1163 }
1164 
1165 // different from XMMatrixDecompose
1166 // note that scale.xyz >= 0
1167 NLIB_M(void)
1168 Matrix::Decompose(SimdVector* scale, SimdMatrix* rot, SimdVector* trans,
1169  SimdMatrixArg m) NLIB_NOEXCEPT {
1170  // translation
1171  *trans = m.r[3];
1172 
1173  // scaling
1174  f128 recp_scale;
1175  {
1176  f128 dot_x = Vector3::DotEx<true, false, false, true>(m.r[0], m.r[0]);
1177  f128 dot_y = Vector3::DotEx<false, true, false, true>(m.r[1], m.r[1]);
1178  f128 dot_z = Vector3::DotEx<false, false, true, true>(m.r[2], m.r[2]);
1179  f128 dot = F128::Or(dot_x, dot_y);
1180  dot = F128::Or(dot, dot_z);
1181  recp_scale = F128::RecpSqrt(dot);
1182  *scale = F128::Mult(dot, recp_scale);
1183  }
1184 
1185  // rotation
1186  rot->r[0] = F128::Mult<0>(recp_scale, m.r[0], each_select32);
1187  rot->r[1] = F128::Mult<1>(recp_scale, m.r[1], each_select32);
1188  rot->r[2] = F128::Mult<2>(recp_scale, m.r[2], each_select32);
1189  rot->r[3] = F128::Set0001();
1190 }
1191 
1192 #undef NLIB_M
1193 
1194 #endif // NLIB_DOXYGEN
1195 
1196 } // namespace simd
1197 NLIB_NAMESPACE_END
1198 
1199 #endif // INCLUDE_NN_NLIB_SIMD_SIMDMATRIX_H_
The class with the collection of functions that handle 4x4 matrices.
Definition: SimdMatrix.h:28
Defines a quaternion.
f128arg SimdVectorArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4157
Definition: Base64.h:25
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:86
#define NLIB_F128_TRANSPOSE(row0, row1, row2, row3)
A macro for in-place matrix transposition.
Definition: SimdFloat.h:4231
constexpr const each_float_tag each_float
The tag for representing a single-precision floating-point number with an each_float_tag-type constan...
Definition: SimdFloat.h:74
f128arg SimdQuaternionArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4159
f128arg SimdPlaneArg
f128arg is defined using typedef.
Definition: SimdFloat.h:4161
f128 r[4]
Keeps each row of a 4x4 matrix.
Definition: SimdFloat.h:4184
The structure for keeping a 4x4 matrix.
Definition: SimdFloat.h:4168
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Config.h:109
Defines the class and functions for SIMD computations on single-precision floating-point numbers...
constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant objec...
Definition: SimdInt.h:63
The type for reading and writing 4x3 matrices in memory. The data member m is a 4x3 matrix...
Definition: SimdFloat.h:4330
The type for reading and writing 3x3 matrices in memory. The data member m is a 3x3 matrix...
Definition: SimdFloat.h:4315
Defines a three-dimensional vector.
nlib_f128_t f128
nlib_f128_t is defined using typedef.
Definition: SimdFloat.h:77
Defines a four-dimensional vector.
The type for reading and writing 4x4 matrices in memory. The data member m is a 4x4 matrix...
Definition: SimdFloat.h:4338
The type for reading and writing 3x4 matrices in memory. The data member m is a 3x4 matrix...
Definition: SimdFloat.h:4322
f128 SimdPlane
f128 is defined using typedef. Used when handling planes.
Definition: SimdFloat.h:4160
f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors...
Definition: SimdFloat.h:4156