LCOV - code coverage report
Current view: top level - core/geom - SPSIMD_Sse.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 39 39 100.0 %
Date: 2024-05-12 00:16:13 Functions: 0 0 -

          Line data    Source code
       1             : /**
       2             : Copyright (c) 2022 Roman Katuntsev <sbkarr@stappler.org>
       3             : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
       4             : 
       5             : Permission is hereby granted, free of charge, to any person obtaining a copy
       6             : of this software and associated documentation files (the "Software"), to deal
       7             : in the Software without restriction, including without limitation the rights
       8             : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
       9             : copies of the Software, and to permit persons to whom the Software is
      10             : furnished to do so, subject to the following conditions:
      11             : 
      12             : The above copyright notice and this permission notice shall be included in
      13             : all copies or substantial portions of the Software.
      14             : 
      15             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      16             : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      17             : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      18             : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      19             : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      20             : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      21             : THE SOFTWARE.
      22             : **/
      23             : 
      24             : // Excluded from documentation/codegen tool
      25             : ///@ SP_EXCLUDE
      26             : 
      27             : #ifndef STAPPLER_GEOM_SPSIMD_SSE_H_
      28             : #define STAPPLER_GEOM_SPSIMD_SSE_H_
      29             : 
      30             : #include "SPSIMD.h"
      31             : #include "simde/x86/sse.h"
      32             : 
      33             : #if __SSE__
      34             : #define SP_SIMD_SSE_STORE_VEC4(vec, value)      *((simde__m128 *)&vec.x) = (value)
      35             : #define SP_SIMD_SSE_LOAD_VEC4(vec)                      *((simde__m128 *)(&vec.x))
      36             : #else
      37             : #define SP_SIMD_SSE_STORE_VEC4(vec, value)      simde_mm_store_ps(&vec.x, value)
      38             : #define SP_SIMD_SSE_LOAD_VEC4(vec)                      simde_mm_load_ps(&vec.x)
      39             : #endif
      40             : 
      41             : namespace STAPPLER_VERSIONIZED stappler::simd::sse {
      42             : 
      43             : using f32x4 = simde__m128;
      44             : 
      45             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(float v1, float v2, float v3, float v4) {
      46             :         return simde_mm_set_ps(v4, v3, v2, v1);
      47             : }
      48             : 
      49             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(const float v[4]) {
      50             :         return simde_mm_load_ps(v);
      51             : }
      52             : 
      53             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load(float v) {
      54             :         return simde_mm_load1_ps(&v);
      55             : }
      56             : 
      57             : SP_ATTR_OPTIMIZE_INLINE_FN inline void store(float target[4], const f32x4 &v) {
      58      141337 :         simde_mm_store_ps(target, v);
      59             : }
      60             : 
      61             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 mul(const f32x4 &v1, const f32x4 &v2) {
      62       98478 :         return simde_mm_mul_ps(v1, v2);
      63             : }
      64             : 
      65             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 div(const f32x4 &v1, const f32x4 &v2) {
      66             :         return simde_mm_div_ps(v1, v2);
      67             : }
      68             : 
      69             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 add(const f32x4 &v1, const f32x4 &v2) {
      70       43585 :         return simde_mm_add_ps(v1, v2);
      71             : }
      72             : 
      73             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 sub(const f32x4 &v1, const f32x4 &v2) {
      74             :         return simde_mm_sub_ps(v1, v2);
      75             : }
      76             : 
      77             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 rsqrt(const f32x4 &v) {
      78             :         return simde_mm_rsqrt_ps(v);
      79             : }
      80             : 
      81             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 load1(float v) {
      82             :         return simde_mm_load_ss(&v);
      83             : }
      84             : 
      85             : SP_ATTR_OPTIMIZE_INLINE_FN inline void store1(float *target, const f32x4 &v) {
      86             :         simde_mm_store_ss(target, v);
      87             : }
      88             : 
      89             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 mul1(const f32x4 &v1, const f32x4 &v2) {
      90             :         return simde_mm_mul_ss(v1, v2);
      91             : }
      92             : 
      93             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 add1(const f32x4 &v1, const f32x4 &v2) {
      94             :         return simde_mm_add_ss(v1, v2);
      95             : }
      96             : 
      97             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 sub1(const f32x4 &v1, const f32x4 &v2) {
      98             :         return simde_mm_sub_ss(v1, v2);
      99             : }
     100             : 
     101             : SP_ATTR_OPTIMIZE_INLINE_FN inline f32x4 rsqrt1(const f32x4 &v) {
     102             :         return simde_mm_rsqrt_ss(v);
     103             : }
     104             : 
     105             : [[maybe_unused]] SP_ATTR_OPTIMIZE_INLINE_FN inline void loadMat4_impl(const float m[16], simde__m128 dst[4]) {
     106             :         dst[0] = simde_mm_load_ps(&m[0]);
     107             :         dst[1] = simde_mm_load_ps(&m[4]);
     108             :         dst[2] = simde_mm_load_ps(&m[8]);
     109             :         dst[3] = simde_mm_load_ps(&m[12]);
     110             : }
     111             : 
     112             : [[maybe_unused]] SP_ATTR_OPTIMIZE_INLINE_FN inline void storeMat4_impl(const simde__m128 m[4], float dst[16]) {
     113             :         simde_mm_store_ps((simde_float32 *)&dst[0], m[0]);
     114             :         simde_mm_store_ps((simde_float32 *)&dst[4], m[1]);
     115             :         simde_mm_store_ps((simde_float32 *)&dst[8], m[2]);
     116             :         simde_mm_store_ps((simde_float32 *)&dst[12], m[3]);
     117             : }
     118             : 
     119             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar_impl(const simde__m128 *m, float scalar, simde__m128 *dst) {
     120             :         auto s = simde_mm_set1_ps(scalar);
     121             :         dst[0] = simde_mm_add_ps(m[0], s);
     122             :         dst[1] = simde_mm_add_ps(m[1], s);
     123             :         dst[2] = simde_mm_add_ps(m[2], s);
     124             :         dst[3] = simde_mm_add_ps(m[3], s);
     125             : }
     126             : 
     127             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4_impl(const simde__m128 *m1, const simde__m128 *m2, simde__m128 *dst) {
     128             :         dst[0] = simde_mm_add_ps(m1[0], m2[0]);
     129             :         dst[1] = simde_mm_add_ps(m1[1], m2[1]);
     130             :         dst[2] = simde_mm_add_ps(m1[2], m2[2]);
     131             :         dst[3] = simde_mm_add_ps(m1[3], m2[3]);
     132             : }
     133             : 
     134             : SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4_impl(const simde__m128 *m1, const simde__m128 *m2, simde__m128 *dst) {
     135             :         dst[0] = simde_mm_sub_ps(m1[0], m2[0]);
     136             :         dst[1] = simde_mm_sub_ps(m1[1], m2[1]);
     137             :         dst[2] = simde_mm_sub_ps(m1[2], m2[2]);
     138             :         dst[3] = simde_mm_sub_ps(m1[3], m2[3]);
     139             : }
     140             : 
     141             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar_impl(const simde__m128 *m, float scalar, simde__m128 *dst) {
     142             :         auto s = simde_mm_set1_ps(scalar);
     143       82100 :         dst[0] = simde_mm_mul_ps(m[0], s);
     144       82100 :         dst[1] = simde_mm_mul_ps(m[1], s);
     145       82100 :         dst[2] = simde_mm_mul_ps(m[2], s);
     146       82100 :         dst[3] = simde_mm_mul_ps(m[3], s);
     147             : }
     148             : 
     149             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4_impl(const simde__m128 m1[4], const simde__m128 m2[4], simde__m128 dst[4]) {
     150             :         simde__m128 dst0, dst1, dst2, dst3;
     151             :         {
     152     2697115 :                 simde__m128 e0 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
     153     2697366 :                 simde__m128 e1 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
     154     2697364 :                 simde__m128 e2 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
     155     2697369 :                 simde__m128 e3 = simde_mm_shuffle_ps(m2[0], m2[0], SIMDE_MM_SHUFFLE(3, 3, 3, 3));
     156             : 
     157     2697313 :                 simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
     158     2697313 :                 simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
     159     2697313 :                 simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
     160     2697313 :                 simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);
     161             : 
     162             :                 simde__m128 a0 = simde_mm_add_ps(v0, v1);
     163             :                 simde__m128 a1 = simde_mm_add_ps(v2, v3);
     164             :                 simde__m128 a2 = simde_mm_add_ps(a0, a1);
     165             : 
     166             :                 dst0 = a2;
     167             :         }
     168             : 
     169             :         {
     170     2697313 :                 simde__m128 e0 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
     171     2697381 :                 simde__m128 e1 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
     172     2697383 :                 simde__m128 e2 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
     173     2697383 :                 simde__m128 e3 = simde_mm_shuffle_ps(m2[1], m2[1], SIMDE_MM_SHUFFLE(3, 3, 3, 3));
     174             : 
     175             :                 simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
     176             :                 simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
     177             :                 simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
     178             :                 simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);
     179             : 
     180             :                 simde__m128 a0 = simde_mm_add_ps(v0, v1);
     181             :                 simde__m128 a1 = simde_mm_add_ps(v2, v3);
     182             :                 simde__m128 a2 = simde_mm_add_ps(a0, a1);
     183             : 
     184             :                 dst1 = a2;
     185             :         }
     186             : 
     187             :         {
     188     2697373 :                 simde__m128 e0 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
     189     2697387 :                 simde__m128 e1 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
     190     2697387 :                 simde__m128 e2 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
     191     2697390 :                 simde__m128 e3 = simde_mm_shuffle_ps(m2[2], m2[2], SIMDE_MM_SHUFFLE(3, 3, 3, 3));
     192             : 
     193             :                 simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
     194             :                 simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
     195             :                 simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
     196             :                 simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);
     197             : 
     198             :                 simde__m128 a0 = simde_mm_add_ps(v0, v1);
     199             :                 simde__m128 a1 = simde_mm_add_ps(v2, v3);
     200             :                 simde__m128 a2 = simde_mm_add_ps(a0, a1);
     201             : 
     202             :                 dst2 = a2;
     203             :         }
     204             : 
     205             :         {
     206     2697327 :                 simde__m128 e0 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(0, 0, 0, 0));
     207     2697353 :                 simde__m128 e1 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(1, 1, 1, 1));
     208     2697357 :                 simde__m128 e2 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(2, 2, 2, 2));
     209     2697359 :                 simde__m128 e3 = simde_mm_shuffle_ps(m2[3], m2[3], SIMDE_MM_SHUFFLE(3, 3, 3, 3));
     210             : 
     211             :                 simde__m128 v0 = simde_mm_mul_ps(m1[0], e0);
     212             :                 simde__m128 v1 = simde_mm_mul_ps(m1[1], e1);
     213             :                 simde__m128 v2 = simde_mm_mul_ps(m1[2], e2);
     214             :                 simde__m128 v3 = simde_mm_mul_ps(m1[3], e3);
     215             : 
     216             :                 simde__m128 a0 = simde_mm_add_ps(v0, v1);
     217             :                 simde__m128 a1 = simde_mm_add_ps(v2, v3);
     218             :                 simde__m128 a2 = simde_mm_add_ps(a0, a1);
     219             : 
     220             :                 dst3 = a2;
     221             :         }
     222     2697348 :         dst[0] = dst0;
     223     2697348 :         dst[1] = dst1;
     224     2697348 :         dst[2] = dst2;
     225     2697348 :         dst[3] = dst3;
     226             : }
     227             : 
     228             : SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4_impl(const simde__m128 m[4], simde__m128 dst[4]) {
     229             :         simde__m128 z = simde_mm_setzero_ps();
     230             :         dst[0] = simde_mm_sub_ps(z, m[0]);
     231             :         dst[1] = simde_mm_sub_ps(z, m[1]);
     232             :         dst[2] = simde_mm_sub_ps(z, m[2]);
     233             :         dst[3] = simde_mm_sub_ps(z, m[3]);
     234             : }
     235             : 
// Transposes a 4x4 matrix held as four SIMD columns (same scheme as the
// classic _MM_TRANSPOSE4_PS: interleave 64-bit halves, then pick even/odd
// lanes). All of `m` is read into temporaries before the first write to
// `dst`, so `dst == m` (in-place transpose) is safe.
SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4_impl(const simde__m128 m[4], simde__m128 dst[4]) {
	simde__m128 tmp0 = simde_mm_shuffle_ps(m[0], m[1], 0x44); // 0x44 = (1,0,1,0): (m0[0], m0[1], m1[0], m1[1])
	simde__m128 tmp2 = simde_mm_shuffle_ps(m[0], m[1], 0xEE); // 0xEE = (3,2,3,2): (m0[2], m0[3], m1[2], m1[3])
	simde__m128 tmp1 = simde_mm_shuffle_ps(m[2], m[3], 0x44); // (m2[0], m2[1], m3[0], m3[1])
	simde__m128 tmp3 = simde_mm_shuffle_ps(m[2], m[3], 0xEE); // (m2[2], m2[3], m3[2], m3[3])

	dst[0] = simde_mm_shuffle_ps(tmp0, tmp1, 0x88); // 0x88 = (2,0,2,0): even lanes -> row 0
	dst[1] = simde_mm_shuffle_ps(tmp0, tmp1, 0xDD); // 0xDD = (3,1,3,1): odd lanes  -> row 1
	dst[2] = simde_mm_shuffle_ps(tmp2, tmp3, 0x88); // row 2
	dst[3] = simde_mm_shuffle_ps(tmp2, tmp3, 0xDD); // row 3
}
     247             : 
     248             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components_impl(const simde__m128 m[4], float x, float y, float z, float w, simde__m128& dst) {
     249             :         simde__m128 col1 = simde_mm_set1_ps(x);
     250             :         simde__m128 col2 = simde_mm_set1_ps(y);
     251             :         simde__m128 col3 = simde_mm_set1_ps(z);
     252             :         simde__m128 col4 = simde_mm_set1_ps(w);
     253             : 
     254        7009 :         dst = simde_mm_add_ps(
     255             :                         simde_mm_add_ps(simde_mm_mul_ps(m[0], col1), simde_mm_mul_ps(m[1], col2)),
     256             :                         simde_mm_add_ps(simde_mm_mul_ps(m[2], col3), simde_mm_mul_ps(m[3], col4))
     257             :         );
     258             : }
     259             : 
     260             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4_impl(const simde__m128 m[4], const simde__m128 &v, simde__m128& dst) {
     261             :         simde__m128 col1 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(0, 0, 0, 0));
     262             :         simde__m128 col2 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(1, 1, 1, 1));
     263             :         simde__m128 col3 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(2, 2, 2, 2));
     264             :         simde__m128 col4 = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(3, 3, 3, 3));
     265             : 
     266             :         dst = simde_mm_add_ps(
     267             :                 simde_mm_add_ps(simde_mm_mul_ps(m[0], col1), simde_mm_mul_ps(m[1], col2)),
     268             :                 simde_mm_add_ps(simde_mm_mul_ps(m[2], col3), simde_mm_mul_ps(m[3], col4))
     269             :         );
     270             : }
     271             : 
     272             : #if SP_GEOM_DEFAULT_SIMD == SP_GEOM_DEFAULT_SIMD_SSE
     273             : 
     274             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar(const float m[16], float scalar, float dst[16]) {
     275             :         addMat4Scalar_impl((const simde__m128 *)m, scalar, (simde__m128 *)dst);
     276             : }
     277             : 
     278             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4(const float m1[16], const float m2[16], float dst[16]) {
     279             :         addMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
     280             : }
     281             : 
     282             : SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4(const float m1[16], const float m2[16], float dst[16]) {
     283             :         subtractMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
     284             : }
     285             : 
     286             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar(const float m[16], float scalar, float dst[16]) {
     287             :         multiplyMat4Scalar_impl((const simde__m128 *)m, scalar, (simde__m128 *)dst);
     288             : }
     289             : 
     290             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4(const float m1[16], const float m2[16], float dst[16]) {
     291             :         multiplyMat4_impl((const simde__m128 *)m1, (const simde__m128 *)m2, (simde__m128 *)dst);
     292             : }
     293             : 
     294             : SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4(const float m[16], float dst[16]) {
     295             :         negateMat4_impl((const simde__m128 *)m, (simde__m128 *)dst);
     296             : }
     297             : 
     298             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4(const float m[16], float dst[16]) {
     299             :         transposeMat4_impl((const simde__m128 *)m, (simde__m128 *)dst);
     300             : }
     301             : 
     302             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components(const float m[16], float x, float y, float z, float w, float dst[4]) {
     303             :         transformVec4Components_impl((const simde__m128 *)m, x, y, z, w, *(simde__m128 *)dst);
     304             : }
     305             : 
     306             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4(const float m[16], const float v[4], float dst[4]) {
     307             :         transformVec4_impl((const simde__m128 *)m, *(const simde__m128 *)v, *(simde__m128 *)dst);
     308             : }
     309             : 
     310             : #else
     311             : 
     312             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4Scalar(const float m[16], float scalar, float dst[16]) {
     313             :         simde__m128 dstM[4];
     314             :         simde__m128 tmpM[4];
     315             : 
     316             :         loadMat4_impl(m, tmpM);
     317             :         addMat4Scalar_impl(tmpM, scalar, dstM);
     318             :         storeMat4_impl(dstM, dst);
     319             : }
     320             : 
     321             : SP_ATTR_OPTIMIZE_INLINE_FN inline void addMat4(const float m1[16], const float m2[16], float dst[16]) {
     322             :         simde__m128 dstM[4];
     323             :         simde__m128 tmpM1[4];
     324             :         simde__m128 tmpM2[4];
     325             : 
     326             :         loadMat4_impl(m1, tmpM1);
     327             :         loadMat4_impl(m2, tmpM2);
     328             :         addMat4_impl(tmpM1, tmpM2, dstM);
     329             :         storeMat4_impl(dstM, dst);
     330             : }
     331             : 
     332             : SP_ATTR_OPTIMIZE_INLINE_FN inline void subtractMat4(const float m1[16], const float m2[16], float dst[16]) {
     333             :         simde__m128 dstM[4];
     334             :         simde__m128 tmpM1[4];
     335             :         simde__m128 tmpM2[4];
     336             : 
     337             :         loadMat4_impl(m1, tmpM1);
     338             :         loadMat4_impl(m2, tmpM2);
     339             :         subtractMat4_impl(tmpM1, tmpM2, dstM);
     340             :         storeMat4_impl(dstM, dst);
     341             : }
     342             : 
     343             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4Scalar(const float m[16], float scalar, float dst[16]) {
     344             :         simde__m128 dstM[4];
     345             :         simde__m128 tmpM[4];
     346             : 
     347             :         loadMat4_impl(m, tmpM);
     348             :         multiplyMat4Scalar_impl(tmpM, scalar, dstM);
     349             :         storeMat4_impl(dstM, dst);
     350             : }
     351             : 
     352             : SP_ATTR_OPTIMIZE_INLINE_FN inline void multiplyMat4(const float m1[16], const float m2[16], float dst[16]) {
     353             :         simde__m128 dstM[4];
     354             :         simde__m128 tmpM1[4];
     355             :         simde__m128 tmpM2[4];
     356             : 
     357             :         loadMat4_impl(m1, tmpM1);
     358             :         loadMat4_impl(m2, tmpM2);
     359             :         multiplyMat4_impl(tmpM1, tmpM2, dstM);
     360             :         storeMat4_impl(dstM, dst);
     361             : }
     362             : 
     363             : SP_ATTR_OPTIMIZE_INLINE_FN inline void negateMat4(const float m[16], float dst[16]) {
     364             :         simde__m128 dstM[4];
     365             :         simde__m128 tmpM[4];
     366             : 
     367             :         loadMat4_impl(m, tmpM);
     368             :         negateMat4_impl(tmpM, dstM);
     369             :         storeMat4_impl(dstM, dst);
     370             : }
     371             : 
     372             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transposeMat4(const float m[16], float dst[16]) {
     373             :         simde__m128 dstM[4];
     374             :         simde__m128 tmpM[4];
     375             : 
     376             :         loadMat4_impl(m, tmpM);
     377             :         transposeMat4_impl(tmpM, dstM);
     378             :         storeMat4_impl(dstM, dst);
     379             : }
     380             : 
     381             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4Components(const float m[16], float x, float y, float z, float w, float dst[4]) {
     382             :         simde__m128 tmpM[4];
     383             :         simde__m128 dstV;
     384             :         loadMat4_impl(m, tmpM);
     385             : 
     386             :         transformVec4Components_impl(tmpM, x, y, z, w, dstV);
     387             :         simde_mm_store_ps((simde_float32 *)dst, dstV);
     388             : }
     389             : 
     390             : SP_ATTR_OPTIMIZE_INLINE_FN inline void transformVec4(const float m[16], const float v[4], float dst[4]) {
     391             :         simde__m128 tmpM[4];
     392             :         simde__m128 dstV;
     393             :         loadMat4_impl(m, tmpM);
     394             : 
     395             :         transformVec4_impl(tmpM, simde_mm_load_ps(v), dstV);
     396             :         simde_mm_store_ps((simde_float32 *)dst, dstV);
     397             : }
     398             : 
     399             : #endif
     400             : 
     401             : SP_ATTR_OPTIMIZE_INLINE_FN inline void crossVec3(const float v1[3], const float v2[3], float dst[3]) {
     402         200 :         const float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
     403         200 :         const float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
     404         200 :         const float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
     405             : 
     406         200 :         dst[0] = x;
     407         200 :         dst[1] = y;
     408         200 :         dst[2] = z;
     409             : }
     410             : 
     411             : // input for test A->B vs C->D (ax, ay, bx, by), (cx, cy, dx, dy)
// Bounding-box overlap test for segments A->B and C->D.
// Input packing: v1 = (ax, ay, bx, by), v2 = (cx, cy, dx, dy).
// Out-parameter `isect` is always written with (bx-ax, by-ay, dx-cx, dy-cy)
// — the two segment direction vectors — regardless of the return value.
SP_ATTR_OPTIMIZE_INLINE_FN inline bool isVec2BboxIntersects(const f32x4 & v1, const f32x4 & v2, f32x4 &isect) {
	// 16-byte-aligned scratch for the aligned simde_mm_store_ps below
	struct alignas(16) data_t {
	    float data[4];
	} ret;

	simde__m128 v1vec = simde_mm_movelh_ps(v1, v2); // (ax, ay, cx, cy) — segment start points
	simde__m128 v2vec = simde_mm_movehl_ps(v2, v1); // (bx, by, dx, dy) — segment end points

	simde__m128 minVec = simde_mm_min_ps(v1vec, v2vec); // per-segment bbox minima: (minAB.x, minAB.y, minCD.x, minCD.y)
	simde__m128 maxVec = simde_mm_max_ps(v1vec, v2vec); // per-segment bbox maxima: (maxAB.x, maxAB.y, maxCD.x, maxCD.y)

	isect = simde_mm_sub_ps(v2vec, v1vec);

	// movehl(maxVec, minVec) = (minCD.x, minCD.y, maxCD.x, maxCD.y), so in
	// lanes 0,1 the expression cancels to (maxAB.x - minCD.x, maxAB.y - minCD.y).
	// Lanes 2,3 are never examined below.
	simde_mm_store_ps(ret.data, simde_mm_sub_ps(
			simde_mm_sub_ps(maxVec, minVec),
			simde_mm_sub_ps(
					simde_mm_movehl_ps(maxVec, minVec),
					minVec) ));

	// Intersects when AB's bbox max is not left of / below CD's bbox min on
	// both axes, excluding the case where both differences are exactly zero.
	// NOTE(review): only one direction of the interval-overlap condition is
	// checked (maxAB >= minCD, not also maxCD >= minAB) — presumably callers
	// guarantee or tolerate this; confirm against call sites.
	if (ret.data[0] >= 0.0f && ret.data[1] >= 0.0f && (ret.data[0] != 0.0f || ret.data[1] != 0.0f)) {
		return true;
	}
	return false;
}
     436             : 
     437             : }
     438             : 
     439             : #endif /* STAPPLER_GEOM_SPSIMD_SSE_H_ */

Generated by: LCOV version 1.14