/**
Copyright (c) 2022 Roman Katuntsev <sbkarr@stappler.org>
Copyright (c) 2023 Stappler LLC <admin@stappler.dev>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
**/

// Excluded from documentation/codegen tool
///@ SP_EXCLUDE

#ifndef STAPPLER_GEOM_SPSIMD_SSE_H_
#define STAPPLER_GEOM_SPSIMD_SSE_H_

#include "SPSIMD.h"
#include "simde/x86/sse.h"

#if __SSE__
#define SP_SIMD_SSE_STORE_VEC4(vec, value) *((simde__m128 *)&(vec).x) = (value)
#define SP_SIMD_SSE_LOAD_VEC4(vec) *((simde__m128 *)&(vec).x)
#else
#define SP_SIMD_SSE_STORE_VEC4(vec, value) simde_mm_store_ps(&(vec).x, (value))
#define SP_SIMD_SSE_LOAD_VEC4(vec) simde_mm_load_ps(&(vec).x)
#endif
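
/* Usage sketch (illustrative only): `Vec4` stands for any hypothetical struct
 * whose member `x` begins four contiguous, 16-byte-aligned floats (x, y, z, w):
 *
 *   Vec4 a, b;
 *   SP_SIMD_SSE_STORE_VEC4(a, simde_mm_set1_ps(1.0f)); // a = (1, 1, 1, 1)
 *   simde__m128 v = SP_SIMD_SSE_LOAD_VEC4(a);
 *   SP_SIMD_SSE_STORE_VEC4(b, simde_mm_add_ps(v, v));  // b = (2, 2, 2, 2)
 */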

namespace STAPPLER_VERSIONIZED stappler::simd::sse {

using f32x4 = simde__m128;

// simde_mm_set_ps takes lanes in reverse order, so v1 ends up in lane 0
SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(float v1, float v2, float v3, float v4) {
	return simde_mm_set_ps(v4, v3, v2, v1);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(const float v[4]) {
	return simde_mm_load_ps(v);
}

// broadcasts a single value into all four lanes
SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(float v) {
	return simde_mm_load1_ps(&v);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void store(float target[4], const f32x4 &v) {
	simde_mm_store_ps(target, v);
}
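
/* Lane-order sketch (illustrative): load(v1, v2, v3, v4) places v1 in lane 0,
 * so store() writes the arguments back in the same order:
 *
 *   alignas(16) float out[4];
 *   store(out, load(1.f, 2.f, 3.f, 4.f)); // out == {1.f, 2.f, 3.f, 4.f}
 */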

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 mul(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_mul_ps(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 div(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_div_ps(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 add(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_add_ps(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 sub(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_sub_ps(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 rsqrt(const f32x4 &v) {
	return simde_mm_rsqrt_ps(v);
}

// The *1 variants operate on lane 0 only (the scalar `ss` instruction forms)
SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load1(float v) {
	return simde_mm_load_ss(&v);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void store1(float *target, const f32x4 &v) {
	simde_mm_store_ss(target, v);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 mul1(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_mul_ss(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 add1(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_add_ss(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 sub1(const f32x4 &v1, const f32x4 &v2) {
	return simde_mm_sub_ss(v1, v2);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 rsqrt1(const f32x4 &v) {
	return simde_mm_rsqrt_ss(v);
}
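
/* Note: rsqrt/rsqrt1 use the hardware reciprocal-square-root approximation
 * (relative error up to about 1.5 * 2^-12 on x86). A caller that needs more
 * precision could refine with one Newton-Raphson step (sketch, not part of
 * this API):
 *
 *   f32x4 y = rsqrt(x);
 *   y = mul(y, sub(load(1.5f), mul(load(0.5f), mul(x, mul(y, y)))));
 */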

// A Mat4 is handled as four 4-lane registers; simde_mm_load_ps/simde_mm_store_ps
// expect 16-byte-aligned pointers, as their _mm_* counterparts do
[[maybe_unused]] SP_ATTR_OPTIMIZE_INLINE_FN inline void loadMat4_impl(const float m[16], simde__m128 dst[4]) {
	dst[0] = simde_mm_load_ps(&m[0]);
	dst[1] = simde_mm_load_ps(&m[4]);
	dst[2] = simde_mm_load_ps(&m[8]);
	dst[3] = simde_mm_load_ps(&m[12]);
}

[[maybe_unused]] SP_ATTR_OPTIMIZE_INLINE_FN inline void storeMat4_impl(const simde__m128 m[4], float dst[16]) {
	simde_mm_store_ps((simde_float32 *)&dst[0], m[0]);
	simde_mm_store_ps((simde_float32 *)&dst[4], m[1]);
	simde_mm_store_ps((simde_float32 *)&dst[8], m[2]);
	simde_mm_store_ps((simde_float32 *)&dst[12], m[3]);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar_impl(const simde__m128 *m, float scalar, simde__m128 *dst) {
	auto s = simde_mm_set1_ps(scalar);
	dst[0] = simde_mm_add_ps(m[0], s);
	dst[1] = simde_mm_add_ps(m[1], s);
	dst[2] = simde_mm_add_ps(m[2], s);
	dst[3] = simde_mm_add_ps(m[3], s);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4_impl(const simde__m128 *m1, const simde__m128 *m2, simde__m128 *dst) {
	dst[0] = simde_mm_add_ps(m1[0], m2[0]);
	dst[1] = simde_mm_add_ps(m1[1], m2[1]);
	dst[2] = simde_mm_add_ps(m1[2], m2[2]);
	dst[3] = simde_mm_add_ps(m1[3], m2[3]);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4_impl(const simde__m128 *m1, const simde__m128 *m2, simde__m128 *dst) {
	dst[0] = simde_mm_sub_ps(m1[0], m2[0]);
	dst[1] = simde_mm_sub_ps(m1[1], m2[1]);
	dst[2] = simde_mm_sub_ps(m1[2], m2[2]);
	dst[3] = simde_mm_sub_ps(m1[3], m2[3]);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar_impl(const simde__m128 *m, float scalar, simde__m128 *dst) {
	auto s = simde_mm_set1_ps(scalar);
	dst[0] = simde_mm_mul_ps(m[0], s);
	dst[1] = simde_mm_mul_ps(m[1], s);
	dst[2] = simde_mm_mul_ps(m[2], s);
	dst[3] = simde_mm_mul_ps(m[3], s);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4_impl(const simde__m128 m1[4], const simde__m128 m2[4], simde__m128 dst[4]) {
	simde__m128 dst0, dst1, dst2, dst3;
	{
		simde__m128 e0 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
		simde__m128 e1 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
		simde__m128 e2 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
		simde__m128 e3 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(3, 3, 3, 3));

		simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
		simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
		simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
		simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);

		simde__m128 a0 = simde_mm_add_ps(v0, v1);
		simde__m128 a1 = simde_mm_add_ps(v2, v3);
		simde__m128 a2 = simde_mm_add_ps(a0, a1);

		dst0 = a2;
	}

	{
		simde__m128 e0 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
		simde__m128 e1 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
		simde__m128 e2 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
		simde__m128 e3 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(3, 3, 3, 3));

		simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
		simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
		simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
		simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);

		simde__m128 a0 = simde_mm_add_ps(v0, v1);
		simde__m128 a1 = simde_mm_add_ps(v2, v3);
		simde__m128 a2 = simde_mm_add_ps(a0, a1);

		dst1 = a2;
	}

	{
		simde__m128 e0 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
		simde__m128 e1 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
		simde__m128 e2 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
		simde__m128 e3 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(3, 3, 3, 3));

		simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
		simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
		simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
		simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);

		simde__m128 a0 = simde_mm_add_ps(v0, v1);
		simde__m128 a1 = simde_mm_add_ps(v2, v3);
		simde__m128 a2 = simde_mm_add_ps(a0, a1);

		dst2 = a2;
	}

	{
		simde__m128 e0 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
		simde__m128 e1 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
		simde__m128 e2 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
		simde__m128 e3 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(3, 3, 3, 3));

		simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
		simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
		simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
		simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);

		simde__m128 a0 = simde_mm_add_ps(v0, v1);
		simde__m128 a1 = simde_mm_add_ps(v2, v3);
		simde__m128 a2 = simde_mm_add_ps(a0, a1);

		dst3 = a2;
	}
	dst[0] = dst0;
	dst[1] = dst1;
	dst[2] = dst2;
	dst[3] = dst3;
}
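
/* multiplyMat4_impl computes dst = m1 * m2, treating each simde__m128 as one
 * column of a 4x4 matrix. Equivalent scalar sketch (illustrative, assuming the
 * float[16] layout is column-major):
 *
 *   for (int j = 0; j < 4; ++j) {
 *       for (int i = 0; i < 4; ++i) {
 *           float s = 0.0f;
 *           for (int k = 0; k < 4; ++k) {
 *               s += m1[k * 4 + i] * m2[j * 4 + k];
 *           }
 *           dst[j * 4 + i] = s;
 *       }
 *   }
 */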

SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4_impl(const simde__m128 m[4], simde__m128 dst[4]) {
	simde__m128 z = simde_mm_setzero_ps();
	dst[0] = simde_mm_sub_ps(z, m[0]);
	dst[1] = simde_mm_sub_ps(z, m[1]);
	dst[2] = simde_mm_sub_ps(z, m[2]);
	dst[3] = simde_mm_sub_ps(z, m[3]);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4_impl(const simde__m128 m[4], simde__m128 dst[4]) {
	simde__m128 tmp0 = simde_mm_shuffle_ps(m[0], m[1], 0x44);
	simde__m128 tmp2 = simde_mm_shuffle_ps(m[0], m[1], 0xEE);
	simde__m128 tmp1 = simde_mm_shuffle_ps(m[2], m[3], 0x44);
	simde__m128 tmp3 = simde_mm_shuffle_ps(m[2], m[3], 0xEE);

	dst[0] = simde_mm_shuffle_ps(tmp0, tmp1, 0x88);
	dst[1] = simde_mm_shuffle_ps(tmp0, tmp1, 0xDD);
	dst[2] = simde_mm_shuffle_ps(tmp2, tmp3, 0x88);
	dst[3] = simde_mm_shuffle_ps(tmp2, tmp3, 0xDD);
}
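
/* transposeMat4_impl is the classic shuffle-based 4x4 transpose (the same
 * pattern as _MM_TRANSPOSE4_PS): the first shuffle pass interleaves the low
 * (0x44) and high (0xEE) halves of register pairs, and the second pass picks
 * the even (0x88) and odd (0xDD) lanes to form the transposed columns.
 */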

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components_impl(const simde__m128 m[4], float x, float y, float z, float w, simde__m128 &dst) {
	simde__m128 col1 = simde_mm_set1_ps(x);
	simde__m128 col2 = simde_mm_set1_ps(y);
	simde__m128 col3 = simde_mm_set1_ps(z);
	simde__m128 col4 = simde_mm_set1_ps(w);

	dst = simde_mm_add_ps(
			simde_mm_add_ps(simde_mm_mul_ps(m[0], col1), simde_mm_mul_ps(m[1], col2)),
			simde_mm_add_ps(simde_mm_mul_ps(m[2], col3), simde_mm_mul_ps(m[3], col4)));
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4_impl(const simde__m128 m[4], const simde__m128 &v, simde__m128 &dst) {
	simde__m128 col1 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
	simde__m128 col2 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
	simde__m128 col3 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
	simde__m128 col4 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(3, 3, 3, 3));

	dst = simde_mm_add_ps(
			simde_mm_add_ps(simde_mm_mul_ps(m[0], col1), simde_mm_mul_ps(m[1], col2)),
			simde_mm_add_ps(simde_mm_mul_ps(m[2], col3), simde_mm_mul_ps(m[3], col4)));
}
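
/* Usage sketch (illustrative; `mat16` and `vec4` are hypothetical 16-byte-aligned
 * buffers): computes out = M * v for a matrix held as four column registers:
 *
 *   simde__m128 m[4], out;
 *   loadMat4_impl(mat16, m);                            // const float mat16[16]
 *   transformVec4_impl(m, simde_mm_load_ps(vec4), out); // const float vec4[4]
 */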

#if SP_GEOM_DEFAULT_SIMD == SP_GEOM_DEFAULT_SIMD_SSE

// Native SSE path: float[16]/float[4] buffers are reinterpreted as simde__m128
// arrays directly, so callers must pass 16-byte-aligned data

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar(const float m[16], float scalar, float dst[16]) {
	addMat4Scalar_impl((const simde__m128 *)m, scalar, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4(const float m1[16], const float m2[16], float dst[16]) {
	addMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4(const float m1[16], const float m2[16], float dst[16]) {
	subtractMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar(const float m[16], float scalar, float dst[16]) {
	multiplyMat4Scalar_impl((const simde__m128 *)m, scalar, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4(const float m1[16], const float m2[16], float dst[16]) {
	multiplyMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4(const float m[16], float dst[16]) {
	negateMat4_impl((const simde__m128 *)m, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4(const float m[16], float dst[16]) {
	transposeMat4_impl((const simde__m128 *)m, (simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components(const float m[16], float x, float y, float z, float w, float dst[4]) {
	transformVec4Components_impl((const simde__m128 *)m, x, y, z, w, *(simde__m128 *)dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4(const float m[16], const float v[4], float dst[4]) {
	transformVec4_impl((const simde__m128 *)m, *(const simde__m128 *)v, *(simde__m128 *)dst);
}

#else

// Portable path: matrices are staged through explicit register loads/stores
// instead of reinterpreting the caller's pointers

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar(const float m[16], float scalar, float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM[4];

	loadMat4_impl(m, tmpM);
	addMat4Scalar_impl(tmpM, scalar, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4(const float m1[16], const float m2[16], float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM1[4];
	simde__m128 tmpM2[4];

	loadMat4_impl(m1, tmpM1);
	loadMat4_impl(m2, tmpM2);
	addMat4_impl(tmpM1, tmpM2, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4(const float m1[16], const float m2[16], float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM1[4];
	simde__m128 tmpM2[4];

	loadMat4_impl(m1, tmpM1);
	loadMat4_impl(m2, tmpM2);
	subtractMat4_impl(tmpM1, tmpM2, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar(const float m[16], float scalar, float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM[4];

	loadMat4_impl(m, tmpM);
	multiplyMat4Scalar_impl(tmpM, scalar, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4(const float m1[16], const float m2[16], float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM1[4];
	simde__m128 tmpM2[4];

	loadMat4_impl(m1, tmpM1);
	loadMat4_impl(m2, tmpM2);
	multiplyMat4_impl(tmpM1, tmpM2, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4(const float m[16], float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM[4];

	loadMat4_impl(m, tmpM);
	negateMat4_impl(tmpM, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4(const float m[16], float dst[16]) {
	simde__m128 dstM[4];
	simde__m128 tmpM[4];

	loadMat4_impl(m, tmpM);
	transposeMat4_impl(tmpM, dstM);
	storeMat4_impl(dstM, dst);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components(const float m[16], float x, float y, float z, float w, float dst[4]) {
	simde__m128 tmpM[4];
	simde__m128 dstV;

	loadMat4_impl(m, tmpM);
	transformVec4Components_impl(tmpM, x, y, z, w, dstV);
	simde_mm_store_ps((simde_float32 *)dst, dstV);
}

SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4(const float m[16], const float v[4], float dst[4]) {
	simde__m128 tmpM[4];
	simde__m128 dstV;

	loadMat4_impl(m, tmpM);
	transformVec4_impl(tmpM, simde_mm_load_ps(v), dstV);
	simde_mm_store_ps((simde_float32 *)dst, dstV);
}

#endif

SP_ATTR_OPTIMIZE_INLINE_FN inline void crossVec3(const float v1[3], const float v2[3], float dst[3]) {
	const float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
	const float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
	const float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);

	dst[0] = x;
	dst[1] = y;
	dst[2] = z;
}

// Input for the A->B vs C->D segment bbox test: v1 = (ax, ay, bx, by), v2 = (cx, cy, dx, dy)
SP_ATTR_OPTIMIZE_INLINE_FN inline bool isVec2BboxIntersects(const f32x4 &v1, const f32x4 &v2, f32x4 &isect) {
	struct alignas(16) data_t {
		float data[4];
	} ret;

	simde__m128 v1vec = simde_mm_movelh_ps(v1, v2); // (ax, ay, cx, cy)
	simde__m128 v2vec = simde_mm_movehl_ps(v2, v1); // (bx, by, dx, dy)

	simde__m128 minVec = simde_mm_min_ps(v1vec, v2vec);
	simde__m128 maxVec = simde_mm_max_ps(v1vec, v2vec);

	isect = simde_mm_sub_ps(v2vec, v1vec);

	simde_mm_store_ps(ret.data, simde_mm_sub_ps(
			simde_mm_sub_ps(maxVec, minVec),
			simde_mm_sub_ps(
					simde_mm_movehl_ps(maxVec, minVec),
					minVec)));

	return ret.data[0] >= 0.0f && ret.data[1] >= 0.0f
			&& (ret.data[0] != 0.0f || ret.data[1] != 0.0f);
}
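
/* Usage sketch (illustrative; the scalars are hypothetical segment endpoints):
 * checks whether the bounding boxes of segments A->B and C->D overlap with
 * nonzero extent; `isect` receives (bx - ax, by - ay, dx - cx, dy - cy):
 *
 *   f32x4 ab = load(ax, ay, bx, by);
 *   f32x4 cd = load(cx, cy, dx, dy);
 *   f32x4 isect;
 *   bool overlaps = isVec2BboxIntersects(ab, cd, isect);
 */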

} // namespace stappler::simd::sse

#endif /* STAPPLER_GEOM_SPSIMD_SSE_H_ */
|