#ifndef AA_AMINO_ARCH_GCC_H
#define AA_AMINO_ARCH_GCC_H

#include <stdint.h>

/** Defined restrict keyword based on language flavor. */
#ifndef AA_RESTRICT
#define AA_RESTRICT
#endif

typedef double aa_vec_2d __attribute__ ((vector_size (16), aligned(16)));
typedef double aa_vec_4d __attribute__ ((vector_size (32), aligned(32)));
typedef int64_t aa_vec_4d_size __attribute__ ((vector_size (32)));
typedef int64_t aa_vec_2d_size __attribute__ ((vector_size (16)));
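/* These types use the GCC vector extension: elements are indexed with [],
 * and +, -, * apply element-wise.  A small illustration (not part of the
 * original header):
 *
 *   aa_vec_4d a = {1, 2, 3, 4};
 *   aa_vec_4d b = a * a;   // {1, 4, 9, 16}
 *   double s = b[2];       // 9
 */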
static inline aa_vec_4d
aa_vec_4d_shuffle( aa_vec_4d a,
                   int64_t i0, int64_t i1, int64_t i2, int64_t i3 ) {
    aa_vec_4d_size m = {i0,i1,i2,i3};
    return __builtin_shuffle(a,m);
}
static inline aa_vec_2d
aa_vec_2d_shuffle( aa_vec_2d a,
                   int64_t i0, int64_t i1 ) {
    aa_vec_2d_size m = {i0,i1};
    return __builtin_shuffle(a,m);
}
static inline aa_vec_2d
aa_vec_2d_swap( aa_vec_2d a ) {
    return aa_vec_2d_shuffle(a, 1, 0 );
}
static inline aa_vec_4d
aa_vec_4d_shuffle2( aa_vec_4d a, aa_vec_4d b,
                    int64_t i0, int64_t i1, int64_t i2, int64_t i3 ) {
    aa_vec_4d_size m = {i0,i1,i2,i3};
    return __builtin_shuffle(a,b,m);
}
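/* Note on __builtin_shuffle: with one input vector the mask selects
 * result[i] = a[m[i]]; with two input vectors (shuffle2), mask indices 0-3
 * pick lanes from a and 4-7 pick lanes from b.  For example (illustrative
 * only), aa_vec_4d_shuffle(x, 1,2,0,3) rotates the first three lanes of x
 * and leaves the last lane in place.
 */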
static inline aa_vec_4d
aa_vec_4d_ld( const double src[4] ) {
    return *(aa_vec_4d*)src;
}
static inline void
aa_vec_4d_st( double dst[4], const aa_vec_4d src ) {
    *(aa_vec_4d*)dst = src;
}
static inline aa_vec_2d
aa_vec_2d_ld( const double src[2] ) {
    return *(aa_vec_2d*)src;
}
static inline void
aa_vec_2d_st( double dst[2], const aa_vec_2d src ) {
    *(aa_vec_2d*)dst = src;
}
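/* Illustrative round trip through the aligned load/store helpers (not part
 * of the original header).  The pointer casts above assume the arrays meet
 * the 32-byte (aa_vec_4d) or 16-byte (aa_vec_2d) alignment of the vector
 * types, so stack arrays should be explicitly aligned:
 *
 *   double a[4] __attribute__((aligned(32))) = {1, 2, 3, 4};
 *   double b[4] __attribute__((aligned(32))) = {5, 6, 7, 8};
 *   double c[4] __attribute__((aligned(32)));
 *   aa_vec_4d_st( c, aa_vec_4d_ld(a) + aa_vec_4d_ld(b) );  // c = {6,8,10,12}
 */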
#ifdef __AVX__
#include "amino/arch/avx.h"
#else // generic load/store
static inline aa_vec_4d
aa_vec_3d_ld( const double src[3] ) {
    aa_vec_4d dst = {src[0], src[1], src[2]};
    return dst;
}

#endif // __AVX__
static inline void
aa_vec_3d_st( double dst[3], const aa_vec_4d src ) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
}
static inline double
aa_vec_4d_dot( const aa_vec_4d a, const aa_vec_4d b ) {
    aa_vec_4d sq = a*b;
    aa_vec_2d y = {sq[2], sq[3]};
    aa_vec_2d x = {sq[0], sq[1]};
    x += y;
    return x[0] + x[1];
}
static inline aa_vec_4d
aa_vec_cross( const aa_vec_4d a, const aa_vec_4d b ) {
    aa_vec_4d tmp = ( a * aa_vec_4d_shuffle(b, 1,2,0,3) -
                      aa_vec_4d_shuffle(a, 1,2,0,3) * b );
    return aa_vec_4d_shuffle(tmp, 1,2,0,3);
}
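/* The shuffle-based cross product above uses the identity
 *   cross(a,b) = rot( a * rot(b) - rot(a) * b ),
 * where rot() cycles lanes {x,y,z} -> {y,z,x} and leaves lane 3 alone;
 * lane 3 of the result is always zero.
 */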
void aa_vecm_cross( const double a[AA_RESTRICT 3],
                    const double b[AA_RESTRICT 3],
                    double c[AA_RESTRICT 3] );
static inline aa_vec_4d
aa_vec_qconj( const aa_vec_4d q ) {
    aa_vec_4d c = -q;
    c[3] = q[3];  /* negate the vector part, keep the scalar (element 3) */
    return c;
}
static inline aa_vec_4d
aa_vec_qmul( const aa_vec_4d a, const aa_vec_4d b ) {
    aa_vec_4d vc;
    vc = ( aa_vec_4d_shuffle( a, 0,2,3,1 ) * aa_vec_4d_shuffle( b, 3,0,2,1) +
           aa_vec_4d_shuffle( a, 1,3,2,0 ) * aa_vec_4d_shuffle( b, 2,1,3,0) +
           aa_vec_4d_shuffle( a, 3,1,0,2 ) * aa_vec_4d_shuffle( b, 0,3,1,2) -
           aa_vec_4d_shuffle( a, 2,0,1,3 ) * aa_vec_4d_shuffle( b, 1,2,0,3) );
    /* the shuffled sum yields the scalar lane with its sign flipped */
    vc[3] = -vc[3];
    return vc;
}
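/* Illustrative usage (not part of the original header): quaternions are laid
 * out {x, y, z, w} with the scalar part in element 3, as assumed throughout
 * this header.
 *
 *   double a[4] __attribute__((aligned(32))) = {0, 0, 0, 1};  // identity
 *   double b[4] __attribute__((aligned(32))) = {1, 0, 0, 0};
 *   double c[4] __attribute__((aligned(32)));
 *   aa_vec_4d_st( c, aa_vec_qmul( aa_vec_4d_ld(a), aa_vec_4d_ld(b) ) );
 *   // c == {1, 0, 0, 0}
 */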
#define AA_VEC_QMUL_2DB( ax, ay, az, aw, bxy, bzw, rxy, rzw ) {  \
        aa_vec_2d aa_vec_tmp;                                    \
        aa_vec_tmp = ax*bzw - az*bxy;                             \
        aa_vec_tmp[0] = -aa_vec_tmp[0];                           \
        rxy = ay*bzw + aw*bxy + aa_vec_2d_swap(aa_vec_tmp);       \
        aa_vec_tmp = ax*bxy + az*bzw;                             \
        aa_vec_tmp[0] = -aa_vec_tmp[0];                           \
        rzw = aw*bzw - ay*bxy + aa_vec_2d_swap(aa_vec_tmp);       \
    }
static inline aa_vec_4d
aa_vec_vqmul( const aa_vec_4d v, const aa_vec_4d q ) {
    aa_vec_4d t = aa_vec_4d_shuffle(v, 2,0,1,1);
    aa_vec_4d y;

    y  = aa_vec_4d_shuffle(v, 1,2,0,0) * aa_vec_4d_shuffle(q, 2,0,1,0);
    y += aa_vec_4d_shuffle(v, 0,1,2,1) * aa_vec_4d_shuffle(q, 3,3,3,2);
    y -= t * aa_vec_4d_shuffle(q, 1,2,0,2);

    /* lanes 0-2 now hold the vector part of (v,0)*q; fill in the scalar
     * lane: w = -(v_x*q_x + v_y*q_y + v_z*q_z) */
    y[3] = -y[3] - v[1]*q[1] - v[2]*q[2];
    return y;
}
void aa_vecm_qmul( const double a[AA_RESTRICT 4],
                   const double b[AA_RESTRICT 4],
                   double c[AA_RESTRICT 4] );
static inline aa_vec_4d
aa_vec_qrot( const aa_vec_4d q, const aa_vec_4d v ) {
    aa_vec_4d a = aa_vec_cross(q,v) + q[3]*v;
    aa_vec_4d b = aa_vec_cross(q,a);
    return v + 2*b;
}
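/* aa_vec_qrot expands q*v*conj(q) for a unit quaternion as
 *   v' = v + 2*( q_vec x (q_vec x v + w*v) ),
 * which is what the two cross products above compute
 * (a = q_vec x v + w*v, b = q_vec x a, result = v + 2*b).
 */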
void aa_vecm_qrot( const double q[AA_RESTRICT 4],
                   const double v[AA_RESTRICT 3],
                   double p[AA_RESTRICT 3] );
static inline aa_vec_4d
aa_vec_qv_tf( const aa_vec_4d q, const aa_vec_4d v, const aa_vec_4d p ) {
    return aa_vec_qrot(q, p) + v;
}
#define AA_VEC_QV_MUL( q0, v0, q1, v1, qr, vr ) {  \
        qr = aa_vec_qmul(q0,q1);                   \
        vr = aa_vec_qv_tf(q0, v0, v1);             \
    }
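/* AA_VEC_QV_MUL composes two rigid transforms in quaternion-translation
 * form: (q0, v0) * (q1, v1) = ( q0*q1, rot(q0, v1) + v0 ), i.e. the
 * rotations multiply and the second translation is rotated by q0 before
 * adding the first translation.
 */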
#define AA_VEC_ROTMAT_LD( R0, R1, R2, ptr ) {  \
        R0 = aa_vec_3d_ld(ptr);                \
        R1 = aa_vec_3d_ld(ptr+3);              \
        R2 = aa_vec_3d_ld(ptr+6);              \
    }
/* Store a rotation matrix from three column vectors. */
static inline void
aa_vec_rotmat_st( double R[AA_RESTRICT 9],
                  aa_vec_4d col0, aa_vec_4d col1, aa_vec_4d col2 )
{
    aa_vec_3d_st( R,   col0 );
    aa_vec_3d_st( R+3, col1 );
    aa_vec_3d_st( R+6, col2 );
}
static inline aa_vec_4d
aa_vec_rotmat_tf( const aa_vec_4d R0,
                  const aa_vec_4d R1,
                  const aa_vec_4d R2,
                  const aa_vec_4d p )
{
    return R0*p[0] + R1*p[1] + R2*p[2];
}
#define AA_VEC_TFMAT_LD( col0, col1, col2, col3, T ) {  \
        col0 = aa_vec_3d_ld(T);                         \
        col1 = aa_vec_3d_ld(T+3);                       \
        col2 = aa_vec_3d_ld(T+6);                       \
        col3 = aa_vec_3d_ld(T+9);                       \
    }
/* Store a transformation matrix from four column vectors. */
static inline void
aa_vec_tfmat_st( double T[AA_RESTRICT 12],
                 aa_vec_4d col0, aa_vec_4d col1, aa_vec_4d col2, aa_vec_4d col3 ) {
    aa_vec_3d_st( T,   col0 );
    aa_vec_3d_st( T+3, col1 );
    aa_vec_3d_st( T+6, col2 );
    aa_vec_3d_st( T+9, col3 );
}
static inline aa_vec_4d
aa_vec_tfmat_tf( const aa_vec_4d T0,
                 const aa_vec_4d T1,
                 const aa_vec_4d T2,
                 const aa_vec_4d T3,
                 const aa_vec_4d p )
{
    return aa_vec_rotmat_tf(T0, T1, T2, p) + T3;
}
#define AA_VEC_TFMUL( T0c0, T0c1, T0c2, T0c3, T1, Uc0, Uc1, Uc2, Uc3 ) {  \
        Uc0 = T0c0*T1[0] + T0c1*T1[1] + T0c2*T1[2];                       \
        Uc1 = T0c0*T1[3] + T0c1*T1[4] + T0c2*T1[5];                       \
        Uc2 = T0c0*T1[6] + T0c1*T1[7] + T0c2*T1[8];                       \
        Uc3 = T0c0*T1[9] + T0c1*T1[10] + T0c2*T1[11] + T0c3;              \
    }
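/* Illustrative usage (not part of the original header): U = T0 * T1 for
 * transforms stored column-major as 12 doubles (9 rotation entries followed
 * by the translation).  Given const double T0[12], T1[12] and double U[12]:
 *
 *   aa_vec_4d c0, c1, c2, c3, u0, u1, u2, u3;
 *   AA_VEC_TFMAT_LD( c0, c1, c2, c3, T0 );
 *   AA_VEC_TFMUL( c0, c1, c2, c3, T1, u0, u1, u2, u3 );
 *   aa_vec_3d_st( U,   u0 );
 *   aa_vec_3d_st( U+3, u1 );
 *   aa_vec_3d_st( U+6, u2 );
 *   aa_vec_3d_st( U+9, u3 );
 */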
void aa_vecm_tfmul( const double T0[AA_RESTRICT 12],
                    const double T1[AA_RESTRICT 12],
                    double U[AA_RESTRICT 12] );
#define AA_VEC_DUQU_MUL( d0r, d0d, d1r, d1d, d2r, d2d ) {        \
        d2r = aa_vec_qmul( d0r, d1r );                           \
        d2d = aa_vec_qmul( d0r, d1d ) + aa_vec_qmul( d0d, d1r ); \
    }
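/* Dual quaternion product: with eps^2 = 0,
 *   (r0 + eps*d0)(r1 + eps*d1) = r0*r1 + eps*( r0*d1 + d0*r1 ),
 * which AA_VEC_DUQU_MUL computes with one quaternion multiply for the real
 * part and two for the dual part.
 */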
void aa_vecm_duqu_mul( const double d0[AA_RESTRICT 8],
                       const double d1[AA_RESTRICT 8],
                       double d2[AA_RESTRICT 8] );
static inline aa_vec_4d
aa_vec_duqu_trans( aa_vec_4d r, aa_vec_4d d ) {
    return 2 * aa_vec_qmul( d, aa_vec_qconj(r));
}
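/* For a unit dual quaternion S = r + eps*d encoding rotation r and
 * translation v, the translation satisfies v = 2 * d * conj(r) as a pure
 * quaternion; aa_vec_duqu_trans returns that product, whose first three
 * lanes are the translation.
 */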
static inline aa_vec_4d
aa_vec_qv2duqu_dual( aa_vec_4d r, aa_vec_4d d ) {
    return aa_vec_vqmul( aa_vec_qconj(r), d ) / 2;
}
#endif //AA_AMINO_ARCH_GCC_H