#ifndef INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
#define INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
// Declarations of the templated load/store helpers (signatures inferred from
// the definitions further below).
template <typename MyVector4>
static f128 __vectorcall LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT;
template <typename MyVector4>
static void __vectorcall StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT;
// Vector4 comparison predicates: each returns true only when the comparison
// holds in all four lanes.
return F128::IsAllMaskTrue(F128::CmpEq(vec1, vec2));
return F128::IsAllMaskTrue(F128::CmpLt(vec1, vec2));
return F128::IsAllMaskTrue(F128::CmpLe(vec1, vec2));
return F128::IsAllMaskTrue(F128::CmpGt(vec1, vec2));
return F128::IsAllMaskTrue(F128::CmpGe(vec1, vec2));
return F128::IsAllMaskTrue(F128::CmpNe(vec1, vec2));

// Approximate equality: every lane of vec1 must be within eps of the
// corresponding lane of vec2.
static bool __vectorcall CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2,
                                   SimdVectorArg eps) NLIB_NOEXCEPT {
  return F128::IsAllMaskTrue(F128::CmpNearEq(vec1, vec2, eps));
}

// IsNaN/IsInfinite report whether any lane is NaN or infinite;
// InBound requires every lane of vec to lie within bounds.
return !F128::IsAllMaskFalse(F128::IsNaN(vec));
return !F128::IsAllMaskFalse(F128::IsInfinite(vec));
return F128::IsAllMaskTrue(F128::InBound(vec, bounds));
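// Minimal usage sketch of the all-lane predicates (array values and variable
// names are illustrative assumptions, not part of this header):
//
//   float a_[4] = {1.f, 2.f, 3.f, 4.f};
//   float b_[4] = {1.f, 2.f, 3.f, 5.f};
//   f128 a = F128::LoadA4(a_);
//   f128 b = F128::LoadA4(b_);
//   bool eq = Vector4::CmpEq(a, b);  // false: lane 3 differs (4 vs 5)
//   bool le = Vector4::CmpLe(a, b);  // true: every lane of a is <= its lane in b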
// Declaration block for the lane-masked dot product (DotEx) and related
// helpers; the declarator lines themselves are not part of this extract.
template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
static f128 __vectorcall
static f128 __vectorcall
#define NLIB_M(tp) inline tp __vectorcall
#define NLIB_MH(tp) inline tp __vectorcall
template <typename MyVector4>
NLIB_MH(f128) Vector4::LoadFloat4(const MyVector4* p) NLIB_NOEXCEPT {
  // MyVector4 must expose float members x, y, z, w laid out contiguously.
  return F128::LoadA4(reinterpret_cast<const float*>(&p->x));
}

template <typename MyVector4>
NLIB_MH(void) Vector4::StoreFloat4(MyVector4* p, SimdVectorArg vec) NLIB_NOEXCEPT {
  F128::StoreA4(reinterpret_cast<float*>(&p->x), vec);
}
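// Illustrative usage with a hypothetical POD exposing float x, y, z, w members
// (the struct and variable names here are assumptions, not part of this header):
//
//   struct MyVec4 { float x, y, z, w; };
//   MyVec4 v = {1.f, 2.f, 3.f, 4.f};
//   f128 r = Vector4::LoadFloat4(&v);   // lanes become (1, 2, 3, 4)
//   Vector4::StoreFloat4(&v, r);        // writes the four lanes back to v.x .. v.w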
// Vector4::Dot: the 4D dot product, replicated into every lane of the result.
#ifdef NLIB_F128_SIMD_NOUSE
  f128 ret;
  ret.vec.v[0] = ret.vec.v[1] = ret.vec.v[2] = ret.vec.v[3] =
      vec1.vec.v[0] * vec2.vec.v[0] + vec1.vec.v[1] * vec2.vec.v[1] +
      vec1.vec.v[2] * vec2.vec.v[2] + vec1.vec.v[3] * vec2.vec.v[3];
  return ret;
#elif defined(NLIB_SSE41)
  return _mm_dp_ps(vec1, vec2, 0xFF);
#elif defined(NLIB_NEON)
  float32x4_t tmp = vmulq_f32(vec1, vec2);
  float32x2_t v1 = vget_low_f32(tmp);
  float32x2_t v2 = vget_high_f32(tmp);
  v1 = vpadd_f32(v1, v1);
  v2 = vpadd_f32(v2, v2);
  v1 = vadd_f32(v1, v2);
  return vcombine_f32(v1, v1);
#elif defined(NLIB_CAFE_PPC)
  f128 tmp = F128::Mult(vec1, vec2);
  f32x2 val = __PS_ADD(tmp.vec.ps[0], tmp.vec.ps[1]);
  val = __PS_SUM0(val, val, val);
  f128 ret;
  ret.vec.ps[0] = ret.vec.ps[1] = __PS_FDUP(val[0]);
  return ret;
#endif
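// For reference, every lane of the returned value holds the scalar sum
//   vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z + vec1.w*vec2.w.
// Hypothetical check (array values and names are assumptions):
//
//   float a_[4] = {1.f, 2.f, 3.f, 4.f};
//   float b_[4] = {5.f, 6.f, 7.f, 8.f};
//   f128 d = Vector4::Dot(F128::LoadA4(a_), F128::LoadA4(b_));
//   // each lane of d == 1*5 + 2*6 + 3*7 + 4*8 == 70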
template <bool SetLane0, bool SetLane1, bool SetLane2, bool SetLane3>
NLIB_M(f128) Vector4::DotEx(SimdVectorArg vec1, SimdVectorArg vec2) NLIB_NOEXCEPT {
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_SSE41)
  // 0xF0: all four elements enter the product; the low nibble selects which
  // result lanes receive the dot product, with the remaining lanes set to zero.
  return _mm_dp_ps(vec1, vec2, (0xF0 | (SetLane0 ? 1 : 0) | (SetLane1 ? 2 : 0) |
                                (SetLane2 ? 4 : 0) | (SetLane3 ? 8 : 0)));
#else
  return F128::Splat<SetLane0, SetLane1, SetLane2, SetLane3>(F128::SetZero(), Dot(vec1, vec2));
#endif
}
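// DotEx places the dot product only in the lanes whose SetLaneN parameter is
// true and zeroes the others. Sketch (operand names are illustrative):
//
//   f128 d0 = Vector4::DotEx<true, false, false, false>(a, b);
//   // d0 == (Dot(a, b), 0, 0, 0), which is why disjoint DotEx results can be
//   // merged with F128::Or, as the multi-dot helpers below do.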
// Packs dot(a, b0) into lane 0 and dot(a, b1) into lane 1; lanes 2 and 3 are zero.
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
  float32x4x2_t trn = vtrnq_f32(b0, b1);
  float32x2_t lo = vget_low_f32(a);
  float32x2_t hi = vget_high_f32(a);
  float32x4_t xxzz = vcombine_f32(vdup_lane_f32(lo, 0), vdup_lane_f32(hi, 0));
  float32x4_t yyww = vcombine_f32(vdup_lane_f32(lo, 1), vdup_lane_f32(hi, 1));
  float32x4_t tmp = vmulq_f32(trn.val[0], xxzz);
  tmp = vmlaq_f32(tmp, yyww, trn.val[1]);
  float32x2_t result = vadd_f32(vget_low_f32(tmp), vget_high_f32(tmp));
  return vcombine_f32(result, vdup_n_f32(0.f));
#else
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  return F128::Or(t0, t1);
#endif
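// The NEON path above computes both dot products at once: vtrnq_f32 interleaves
// b0 and b1, one multiply plus one multiply-accumulate covers the x/y and z/w
// terms, and adding the low and high halves of tmp yields the two sums.
// Equivalent scalar form (illustrative):
//
//   out[0] = a.x*b0.x + a.y*b0.y + a.z*b0.z + a.w*b0.w;
//   out[1] = a.x*b1.x + a.y*b1.y + a.z*b1.z + a.w*b1.w;
//   out[2] = out[3] = 0.f;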
// Packs dot(a, b0), dot(a, b1), and dot(a, b2) into lanes 0..2; lane 3 is zero.
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
  f128 row0, row1, row2, row3;
  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
  float32x4x2_t trn_f1_ = vtrnq_f32(b2, vdupq_n_f32(0.f));
  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
  // row0..row3 now hold the transpose of (b0, b1, b2, 0); the accumulation of
  // a's lanes against these rows (elided in this extract) yields the result.
#else
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
  return F128::Or(F128::Or(t0, t1), t2);
#endif
// Packs dot(a, b0) .. dot(a, b3) into lanes 0..3.
#if !defined(NLIB_F128_SIMD_NOUSE) && defined(NLIB_NEON)
  f128 row0, row1, row2, row3;
  float32x4x2_t trn_f0_ = vtrnq_f32(b0, b1);
  float32x4x2_t trn_f1_ = vtrnq_f32(b2, b3);
  row0 = vcombine_f32(vget_low_f32(trn_f0_.val[0]), vget_low_f32(trn_f1_.val[0]));
  row1 = vcombine_f32(vget_low_f32(trn_f0_.val[1]), vget_low_f32(trn_f1_.val[1]));
  row2 = vcombine_f32(vget_high_f32(trn_f0_.val[0]), vget_high_f32(trn_f1_.val[0]));
  row3 = vcombine_f32(vget_high_f32(trn_f0_.val[1]), vget_high_f32(trn_f1_.val[1]));
  // row0..row3 now hold the transpose of (b0, b1, b2, b3); the accumulation of
  // a's lanes against these rows (elided in this extract) yields the result.
#else
  f128 t0 = Vector4::DotEx<true, false, false, false>(a, b0);
  f128 t1 = Vector4::DotEx<false, true, false, false>(a, b1);
  f128 t2 = Vector4::DotEx<false, false, true, false>(a, b2);
  f128 t3 = Vector4::DotEx<false, false, false, true>(a, b3);
  return F128::Or(F128::Or(t0, t1), F128::Or(t2, t3));
#endif
// Normalizes vec as vec * (1 / sqrt(Dot(vec, vec))), with a zero-length input
// mapping to zero and an infinite squared length mapping to NaN.
f128 dot = Vector4::Dot(vec, vec);
f128 rsqrt = F128::RecpSqrt(dot);
f128 inf = F128::SetInfinity();
f128 zero = F128::SetZero();
f128 eqzero = F128::CmpEq(dot, zero);
f128 eqinf = F128::CmpEq(dot, inf);
f128 ret = F128::Mult(vec, rsqrt);
f128 nan = F128::SetNaN();
ret = F128::Select(eqzero, zero, ret);
ret = F128::Select(eqinf, nan, ret);
return ret;
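// Edge-case sketch (illustrative values; names other than those visible above
// are assumptions):
//
//   float z_[4] = {0.f, 0.f, 0.f, 0.f};
//   f128 n = Vector4::Normalize(F128::LoadA4(z_));
//   // Every lane of n is 0 rather than NaN: the eqzero select routes a zero
//   // squared length to zero, while an infinite squared length becomes NaN.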
NLIB_M(f128) Vector4::LengthSq(SimdVectorArg vec) NLIB_NOEXCEPT { return Dot(vec, vec); }
// Length, reciprocal length, and their *Est (lower-precision estimate)
// variants, all built on Dot(vec, vec); the last line is the estimate-based
// normalize.
return F128::Sqrt(Dot(vec, vec));
return F128::SqrtEst(Dot(vec, vec));
return F128::RecpSqrt(Dot(vec, vec));
return F128::RecpSqrtEst(Dot(vec, vec));
return F128::Mult(vec, RecpLengthEst(vec));
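// Worked example (hypothetical values, not from this header):
//
//   float v_[4] = {3.f, 4.f, 0.f, 0.f};
//   f128 lsq = Vector4::LengthSq(F128::LoadA4(v_));  // every lane == 25
//   // The Sqrt-based form then yields 5 per lane, and the RecpSqrt-based
//   // forms yield 0.2 (exactly or as a fast estimate).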
// Angle between two already-normalized vectors: acos(clamp(dot, -1, 1)).
// The clamp guards against dot products slightly outside [-1, 1] caused by
// floating-point rounding.
f128 ret = Dot(vec1_normalized, vec2_normalized);
ret = F128::Clamp(ret, F128::SetNegativeOne(), F128::SetOne());
return F128::ArcCos(ret);
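// Example (illustrative): for normalized inputs (1, 0, 0, 0) and (0, 1, 0, 0)
// the dot product is 0, so every lane of the result is acos(0) = pi/2.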
f128 s = Dot(vec, normal);
return F128::MultSub(s, normal, vec);
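// This fragment has the shape of a reflection (or projection-removal) routine.
// For reference, the standard reflection of v about a unit normal n is
//   r = v - 2 * (v . n) * n
// assuming F128::MultSub(a, b, c) evaluates c - a * b; both the doubling step
// and the routine's name are not visible in this extract.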
#endif  // NLIB_DOXYGEN
#endif  // INCLUDE_NN_NLIB_SIMD_SIMDVECTOR4_H_
Referenced symbols:

#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.

static bool InBound(SimdVectorArg vec, SimdVectorArg bounds) noexcept
Evaluates whether the elements of vec are inside the bounds of bounds.

static bool CmpNearEq(SimdVectorArg vec1, SimdVectorArg vec2, SimdVectorArg eps) noexcept
Compares vec1 and vec2 to see whether they are nearly equal.

f128arg SimdVectorArg
f128arg is defined using typedef.

static bool IsNaN(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are NaN.

static bool CmpGe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks whether all of the elements of vec1 are greater than or equal to the corresponding elements of vec2.

static bool CmpGt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks whether all of the elements of vec1 are larger than the corresponding elements of vec2.

The structure for keeping a 4x4 matrix.

static bool CmpNe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Compares vec1 and vec2 to see whether they are unequal.

The class with the collection of functions that perform calculations on four-dimensional vectors.

Defines the class and functions for SIMD computations on single-precision floating-point numbers.

constexpr const each_select32_tag each_select32
The tag for representing the selection of a 32-bit lane with an each_select32_tag-type constant object.

static bool CmpEq(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Compares two four-dimensional vectors to see if they are equal.

nlib_f128_t f128
nlib_f128_t is defined using typedef.

static bool CmpLt(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks whether all of the elements of vec1 are smaller than the corresponding elements of vec2.

#define NLIB_STATIC_ASSERT(exp)
Defines a static assertion. Uses static_assert if it is available for use.

static bool IsInfinite(SimdVectorArg vec) noexcept
Evaluates whether any of the elements of vec are positive infinity or negative infinity.

static bool CmpLe(SimdVectorArg vec1, SimdVectorArg vec2) noexcept
Checks whether all of the elements of vec1 are less than or equal to the corresponding elements of vec2.

f128 SimdVector
f128 is defined using typedef. Used when handling three-dimensional or four-dimensional vectors.