nlib
SimdVector4.h
[詳解]
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
4 #define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
5 
7 
8 NLIB_NAMESPACE_BEGIN
9 namespace simd {
10 
12  public:
// Loads the x, y, z, w members of *p into a SimdVector.
// MyVector4 is any structure exposing four 4-byte members named x, y, z, w.
13  template<typename MyVector4>
14  static SimdVector __vectorcall LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT;
// Stores vec into the x, y, z, w members of *p.
15  template<typename MyVector4>
16  static void __vectorcall StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT;
17  static bool __vectorcall CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
18  return F128::IsAllMaskTrue(F128::CmpEq(vec1, vec2));
19  }
20  static bool __vectorcall CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
21  return F128::IsAllMaskTrue(F128::CmpLt(vec1, vec2));
22  }
23  static bool __vectorcall CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
24  return F128::IsAllMaskTrue(F128::CmpLe(vec1, vec2));
25  }
26  static bool __vectorcall CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
27  return F128::IsAllMaskTrue(F128::CmpGt(vec1, vec2));
28  }
29  static bool __vectorcall CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
30  return F128::IsAllMaskTrue(F128::CmpGe(vec1, vec2));
31  }
32  static bool __vectorcall CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
33  return F128::IsAllMaskTrue(F128::CmpNe(vec1, vec2));
34  }
35  static bool __vectorcall
36  CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) NLIB_NOEXCEPT {
37  return F128::IsAllMaskTrue(F128::CmpNearEq(vec1, vec2, eps));
38  }
39  static bool __vectorcall IsNaN(SimdVectorArg vec) NLIB_NOEXCEPT {
40  return !F128::IsAllMaskFalse(F128::IsNaN(vec));
41  }
42  static bool __vectorcall IsInfinite(SimdVectorArg vec) NLIB_NOEXCEPT {
43  return !F128::IsAllMaskFalse(F128::IsInfinite(vec));
44  }
45  static bool __vectorcall InBound(SimdVectorArg vec, SimdVectorArg bounds) NLIB_NOEXCEPT {
46  return F128::IsAllMaskTrue(F128::InBound(vec, bounds));
47  }
48 
// Dot product of vec1 and vec2 replicated to all lanes: r = { dot, dot, dot, dot }.
49  static f128 __vectorcall Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT;
// Dot product written only to the selected lanes: r[i] = SetLane[i] ? dot : 0.f.
50  template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
51  static f128 __vectorcall DotEx(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT;
52  static f128 __vectorcall
54  static f128 __vectorcall
56  static f128 __vectorcall Dot4(SimdVectorArg a, SimdVectorArg b0, SimdVectorArg b1,
// Normalizes vec; yields 0 when |vec| = 0 and NaN when |vec| is too big.
58  static SimdVector __vectorcall Normalize(SimdVectorArg vec) NLIB_NOEXCEPT;
// Normalizes vec using the estimated reciprocal length (no zero/overflow handling).
59  static SimdVector __vectorcall NormalizeEst(SimdVectorArg vec) NLIB_NOEXCEPT;
// Squared length in every lane: r = { lensq, lensq, lensq, lensq }.
60  static f128 __vectorcall LengthSq(SimdVectorArg vec) NLIB_NOEXCEPT;
// Length in every lane: r = { len, len, len, len }.
61  static f128 __vectorcall Length(SimdVectorArg vec) NLIB_NOEXCEPT;
// Same as Length but computed with F128::SqrtEst.
62  static f128 __vectorcall LengthEst(SimdVectorArg vec) NLIB_NOEXCEPT;
// Reciprocal length in every lane: r = { recpLen, recpLen, recpLen, recpLen }.
63  static f128 __vectorcall RecpLength(SimdVectorArg vec) NLIB_NOEXCEPT;
// Same as RecpLength but computed with F128::RecpSqrtEst.
64  static f128 __vectorcall RecpLengthEst(SimdVectorArg vec) NLIB_NOEXCEPT;
// Angle in radians between two vectors that the caller has already normalized,
// replicated to every lane.
65  static f128 __vectorcall GetAngle(SimdVectorArg vec1_normalized,
66  SimdVectorArg vec2_normalized) NLIB_NOEXCEPT;
// Reflects vec using normal (built from Dot(vec, normal) doubled and F128::MultSub).
67  static SimdVector __vectorcall Reflect(SimdVectorArg vec, SimdVectorArg normal) NLIB_NOEXCEPT;
// The matrix-first overload (r = m * vec) is disabled; only r = vec * m is provided.
68  // static SimdVector __vectorcall Transform(SimdMatrixArg m, SimdVectorArg vec) NLIB_NOEXCEPT;
69  static SimdVector __vectorcall Transform(SimdVectorArg vec, SimdMatrixArg m) NLIB_NOEXCEPT;
70 
71  // TransformSequence(....)
72  // Refract, Ortho
73 
74  private:
// Private default constructor: the visible members are all static functions, and the
// class is not meant to be instantiated.
75  Vector4(); // forbidden
76 };
77 
78 #ifndef NLIB_DOXYGEN
79 
// Shorthand for the out-of-class definitions that follow. Both macros expand to
// `inline <tp> __vectorcall`; NLIB_MH is the one used on the template definitions.
// Both are #undef'd again after the definitions.
80 #define NLIB_M(tp) inline tp __vectorcall
81 #define NLIB_MH(tp) inline tp __vectorcall
82 
83 template<typename MyVector4>
84 // MyVector is a structure like 'struct MyVector4 { float x, y, z, w; };'
85 // nlib_ns::simd::Float4, DirectX::XMFLOAT3, DirectX::XMFLOAT3A for example
86 NLIB_MH(SimdVector) Vector4::LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT {
87  NLIB_STATIC_ASSERT(sizeof(p->x) == 4);
88  NLIB_STATIC_ASSERT(sizeof(p->y) == 4);
89  NLIB_STATIC_ASSERT(sizeof(p->z) == 4);
90  NLIB_STATIC_ASSERT(sizeof(p->w) == 4);
91  return F128::LoadA4(reinterpret_cast<const float*>(&p->x));
92 }
93 
94 template<typename MyVector4>
95 // MyVector is a structure like 'struct MyVector4 { float x, y, z, w; };'
96 // nlib_ns::simd::Float4, DirectX::XMFLOAT3, DirectX::XMFLOAT3A for example
97 NLIB_MH(void) Vector4::StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT { // NOLINT
98  NLIB_STATIC_ASSERT(sizeof(p->x) == 4);
99  NLIB_STATIC_ASSERT(sizeof(p->y) == 4);
100  NLIB_STATIC_ASSERT(sizeof(p->z) == 4);
101  NLIB_STATIC_ASSERT(sizeof(p->w) == 4);
102  F128::StoreA4(reinterpret_cast<float*>(&p->x), vec);
103 }
104 
105 // r = { dot, dot, dot, dot }
106 NLIB_M(f128) Vector4::Dot(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
107 #ifdef NLIB_F128_SIMD_NOUSE
108  f128 ret;
109  ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] =
110  vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
111  vec1.vec.v[2] * vec2.vec.v[2] + vec1.vec.v[3] * vec2.vec.v[3];
112  return ret;
113 #elif defined(NLIB_SSE41)
114  return _mm_dp_ps(vec1, vec2, 0xFF);
115 #elif defined(NLIB_NEON)
116  float32x4_t tmp = vmulq_f32(vec1, vec2);
117  float32x2_t v1 = vget_low_f32(tmp);
118  float32x2_t v2 = vget_high_f32(tmp);
119  v1 = vpadd_f32(v1, v1);
120  v2 = vpadd_f32(v2, v2);
121  v1 = vadd_f32(v1, v2);
122  return vcombine_f32(v1, v1);
123 #elif defined(NLIB_CAFE_PPC)
124  f128 tmp = F128::Mult(vec1, vec2);
125  f32x2 val = __PS_ADD(tmp.vec.ps[0], tmp.vec.ps[1]);
126  val = __PS_SUM0(val, val, val);
127  f128 ret;
128  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
129  return ret;
130 #endif
131 }
132 
133 template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
134 // r[i] = SetLane[i] ? dot : 0.f
135 NLIB_MH(f128) Vector4::DotEx(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
136 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41)
137  return _mm_dp_ps(vec1, vec2, (0xF0 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
138  (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
139 #else
140  return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
141 #endif
142 }
143 
144 // r = { dot(a, b0), dot(a, b1), 0.f, 0.f }
145 NLIB_M(f128) Vector4::Dot2(SimdVectorArg a, SimdVectorArg b0, SimdVectorArg b1) NLIB_NOEXCEPT {
146 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
147  float32x4x2_t trn = vtrnq_f32(b0, b1);
148  float32x2_t lo = vget_low_f32(a);
149  float32x2_t hi = vget_high_f32(a);
150  float32x4_t xxzz = vcombine_f32(vdup_lane_f32(lo, 0), vdup_lane_f32(hi, 0));
151  float32x4_t yyww = vcombine_f32(vdup_lane_f32(lo, 1), vdup_lane_f32(hi, 1));
152  float32x4_t tmp = vmulq_f32(trn.val[0], xxzz);
153  tmp = vmlaq_f32(tmp, yyww, trn.val[1]);
154  float32x2_t result = vadd_f32(vget_low_f32(tmp), vget_high_f32(tmp));
155  return vcombine_f32(result, vdup_n_f32(0.f));
156 #else
157  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
158  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
159  return F128::Or(t0, t1);
160 #endif
161 }
162 
163 // r = { dot(a, b0), dot(a, b1), dot(a, b2), 0.f }
164 NLIB_M(f128) Vector4::Dot3(SimdVectorArg a, SimdVectorArg b0, SimdVectorArg b1,
165  SimdVectorArg b2) NLIB_NOEXCEPT {
166 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
167  f128 row0, row1, row2, row3;
168  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
169  float32x4x2_t trn_f1_ = vtrnq_f32(b2, vdupq_n_f32(0.f));
170  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
171  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
172  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
173  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
174  f128 ret = F128::Mult<0>(a, row0, each_select32);
175  ret = F128::MultAdd<1>(a, row1, ret, each_select32);
176  ret = F128::MultAdd<2>(a, row2, ret, each_select32);
177  return F128::MultAdd<3>(a, row3, ret, each_select32);
178 #else
179  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
180  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
181  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
182  return F128::Or(F128::Or(t0, t1), t2);
183 #endif
184 }
185 
186 // r = { dot(a, b0), dot(a, b1), dot(a, b2), dot(a, b3) }
187 NLIB_M(f128) Vector4::Dot4(SimdVectorArg a, SimdVectorArg b0, SimdVectorArg b1, SimdVectorArg b2,
188  SimdVectorArg b3) NLIB_NOEXCEPT {
189 #if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
190  f128 row0, row1, row2, row3;
191  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
192  float32x4x2_t trn_f1_ = vtrnq_f32(b2, b3);
193  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
194  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
195  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
196  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
197  f128 ret = F128::Mult<0>(a, row0, each_select32);
198  ret = F128::MultAdd<1>(a, row1, ret, each_select32);
199  ret = F128::MultAdd<2>(a, row2, ret, each_select32);
200  return F128::MultAdd<3>(a, row3, ret, each_select32);
201 #else
202  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
203  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
204  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
205  f128 t3 = Vector4::DotEx<false, false, false, true>(a, b3);
206  return F128::Or(F128::Or(t0, t1), F128::Or(t2, t3));
207 #endif
208 }
209 
210 // nan if |vec| is too big, 0 if |vec| = 0
211 NLIB_M(SimdVector) Vector4::Normalize(SimdVectorArg vec) NLIB_NOEXCEPT {
212  f128 dot = Vector4::Dot(vec, vec);
213  f128 rsqrt = F128::RecpSqrt(dot);
214  f128 inf = F128::SetInfinity();
215  f128 zero = F128::SetZero();
216  f128 eqzero = F128::CmpEq(dot, zero);
217  f128 eqinf = F128::CmpEq(dot, inf);
218  SimdVector ret = F128::Mult(vec, rsqrt);
219  f128 nan = F128::SetNaN();
220  ret = F128::Select(eqzero, zero, ret);
221  ret = F128::Select(eqinf, nan, ret);
222  return ret;
223 }
224 
225 // r = { lensq, lensq, lensq, lensq }
226 NLIB_M(f128) Vector4::LengthSq(SimdVectorArg vec) NLIB_NOEXCEPT { return Dot(vec, vec); }
227 
228 // r = { len, len, len, len }
229 NLIB_M(f128) Vector4::Length(SimdVectorArg vec) NLIB_NOEXCEPT {
230  return F128::Sqrt(Dot(vec, vec));
231 }
232 
233 // r = { len, len, len, len }
234 NLIB_M(f128) Vector4::LengthEst(SimdVectorArg vec) NLIB_NOEXCEPT {
235  return F128::SqrtEst(Dot(vec, vec));
236 }
237 
238 // r = { recpLen, recpLen, recpLen, recpLen }
239 NLIB_M(f128) Vector4::RecpLength(SimdVectorArg vec) NLIB_NOEXCEPT {
240  return F128::RecpSqrt(Dot(vec, vec));
241 }
242 
243 // r = { recpLen, recpLen, recpLen, recpLen }
244 NLIB_M(f128) Vector4::RecpLengthEst(SimdVectorArg vec) NLIB_NOEXCEPT {
245  return F128::RecpSqrtEst(Dot(vec, vec));
246 }
247 
248 NLIB_M(SimdVector) Vector4::NormalizeEst(SimdVectorArg vec) NLIB_NOEXCEPT {
249  return F128::Mult(vec, RecpLengthEst(vec));
250 }
251 
252 // { radian, radian, radian, radian }
253 NLIB_M(f128) Vector4::GetAngle(SimdVectorArg vec1_normalized,
254  SimdVectorArg vec2_normalized) NLIB_NOEXCEPT {
255  f128 ret = Dot(vec1_normalized, vec2_normalized);
256  ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
257  return F128::ArcCos(ret);
258 }
259 
260 NLIB_M(SimdVector) Vector4::Reflect(SimdVectorArg vec, SimdVectorArg normal) NLIB_NOEXCEPT {
261  f128 s = Dot(vec, normal);
262  s = F128::Add(s, s);
263  return F128::MultSub(s, normal, vec);
264 }
265 
266 /*
267 // r = m * vec
268 NLIB_M(SimdVector) Vector4::Transform(SimdMatrixArg m, SimdVectorArg vec) NLIB_NOEXCEPT {
269 #ifdef NLIB_F128_SIMD_NOUSE
270  f128 ret;
271  ret.vec.v[0] = m.r[0].vec.v[0] * vec.vec.v[0] + m.r[0].vec.v[1] * vec.vec.v[1] +
272  m.r[0].vec.v[2] * vec.vec.v[2] + m.r[0].vec.v[3] * vec.vec.v[3];
273  ret.vec.v[1] = m.r[1].vec.v[0] * vec.vec.v[0] + m.r[1].vec.v[1] * vec.vec.v[1] +
274  m.r[1].vec.v[2] * vec.vec.v[2] + m.r[1].vec.v[3] * vec.vec.v[3];
275  ret.vec.v[2] = m.r[2].vec.v[0] * vec.vec.v[0] + m.r[2].vec.v[1] * vec.vec.v[1] +
276  m.r[2].vec.v[2] * vec.vec.v[2] + m.r[2].vec.v[3] * vec.vec.v[3];
277  ret.vec.v[3] = m.r[3].vec.v[0] * vec.vec.v[0] + m.r[3].vec.v[1] * vec.vec.v[1] +
278  m.r[3].vec.v[2] * vec.vec.v[2] + m.r[3].vec.v[3] * vec.vec.v[3];
279  return ret;
280 #elif defined(NLIB_SSE41)
281  f128 tmp, ret;
282  ret = _mm_dp_ps(m.r[0], vec, 0xF1);
283  tmp = _mm_dp_ps(m.r[1], vec, 0xF2);
284  ret = _mm_or_ps(ret, tmp);
285  tmp = _mm_dp_ps(m.r[2], vec, 0xF4);
286  ret = _mm_or_ps(ret, tmp);
287  tmp = _mm_dp_ps(m.r[3], vec, 0xF8);
288  ret = _mm_or_ps(ret, tmp);
289  return ret;
290 #elif defined(NLIB_NEON)
291  float32x4_t r0, r1;
292  float32x2_t lo, hi, tmp;
293 
294  r0 = vmulq_f32(m.r[0], vec);
295  r1 = vmulq_f32(m.r[1], vec);
296  lo = vpadd_f32(vget_low_f32(r0), vget_high_f32(r0));
297  tmp = vpadd_f32(vget_low_f32(r1), vget_high_f32(r1));
298  lo = vpadd_f32(lo, tmp);
299 
300  r0 = vmulq_f32(m.r[2], vec);
301  r1 = vmulq_f32(m.r[3], vec);
302  hi = vpadd_f32(vget_low_f32(r0), vget_high_f32(r0));
303  tmp = vpadd_f32(vget_low_f32(r1), vget_high_f32(r1));
304  hi = vpadd_f32(hi, tmp);
305 
306  return vcombine_f32(lo, hi);
307 #endif
308 }
309 */
310 
311 // r = vec * m
312 NLIB_M(SimdVector) Vector4::Transform(SimdVectorArg vec, SimdMatrixArg m) NLIB_NOEXCEPT {
313  f128 ret = F128::Mult<0>(vec, m.r[0], each_select32);
314  ret = F128::MultAdd<1>(vec, m.r[1], ret, each_select32);
315  ret = F128::MultAdd<2>(vec, m.r[2], ret, each_select32);
316  return F128::MultAdd<3>(vec, m.r[3], ret, each_select32);
317 }
318 
319 #undef NLIB_M
320 #undef NLIB_MH
321 
322 #endif // NLIB_DOXYGEN
323 
324 } // namespace simd
325 NLIB_NAMESPACE_END
326 
327 #endif // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
#define NLIB_NOEXCEPT
環境に合わせてnoexcept 又は同等の定義がされます。
Definition: Platform.h:2151
static bool InBound(SimdVectorArg vec, SimdVectorArg bounds) noexcept
vec の要素がbounds の境界内であるかどうかを検証します。
Definition: SimdVector4.h:45
static bool CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) noexcept
vec1 とvec2 がほぼ等しいかどうか比較します。
Definition: SimdVector4.h:36
f128arg SimdVectorArg
f128argがtypedefされています。
Definition: SimdFloat.h:3927
#define NLIB_VIS_HIDDEN
関数やクラス等のシンボルをライブラリの外部に公開しません。
Definition: Platform_unix.h:50
static bool IsNaN(SimdVectorArg vec) noexcept
vec のいずれかの要素がNaNであるかどうかを検証します。
Definition: SimdVector4.h:39
static bool CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
vec1 の全ての要素がvec2 の対応する要素以上かどうか比較します。
Definition: SimdVector4.h:29
static bool CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
vec1 の全ての要素がvec2 の対応する要素よりも大きいかどうか比較します。
Definition: SimdVector4.h:26
4x4行列を保持する構造体です。
Definition: SimdFloat.h:3938
static bool CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
vec1 とvec2 が等しくないかどうか比較します。
Definition: SimdVector4.h:32
4次元ベクトルの計算を行う関数が集められたクラスです。
Definition: SimdVector4.h:11
単精度浮動小数点数のSIMD演算を行うためのクラスや関数が定義されています。
constexpr const each_select32_tag each_select32
each_select32_tag型の定数オブジェクトで、32bitのレーンを選択することを示すためのタグです。 ...
Definition: SimdInt.h:56
static bool CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
2つの4次元ベクトルが等しいかどうか比較します。
Definition: SimdVector4.h:17
nlib_f128_t f128
nlib_f128_tがtypedefされています。
Definition: SimdFloat.h:54
static bool CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
vec1 の全ての要素がvec2 の対応する要素よりも小さいかどうか比較します。
Definition: SimdVector4.h:20
#define NLIB_STATIC_ASSERT(exp)
静的アサートが定義されます。利用可能であればstatic_assertを利用します。
Definition: Config.h:117
static bool IsInfinite(SimdVectorArg vec) noexcept
vec のいずれかの要素が正か負の無限大になっているかどうかを検証します。
Definition: SimdVector4.h:42
static bool CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
vec1 の全ての要素がvec2 の対応する要素以下かどうか比較します。
Definition: SimdVector4.h:23
f128 SimdVector
f128がtypedefされています。3次元ベクトル又は4次元ベクトルを扱う場合に利用されます。 ...
Definition: SimdFloat.h:3926