From 2d185253c12659ccc877f1d8e1b47b46f9d50f16 Mon Sep 17 00:00:00 2001 From: msnh2012 Date: Thu, 13 Aug 2020 12:24:41 +0800 Subject: [PATCH] clear --- include/Msnhnet/core/MsnhnetAvxMathEx.h | 727 ----------------------- include/Msnhnet/core/MsnhnetNeonMathEx.h | 424 ------------- 2 files changed, 1151 deletions(-) delete mode 100644 include/Msnhnet/core/MsnhnetAvxMathEx.h delete mode 100644 include/Msnhnet/core/MsnhnetNeonMathEx.h diff --git a/include/Msnhnet/core/MsnhnetAvxMathEx.h b/include/Msnhnet/core/MsnhnetAvxMathEx.h deleted file mode 100644 index 5b4cce3e..00000000 --- a/include/Msnhnet/core/MsnhnetAvxMathEx.h +++ /dev/null @@ -1,727 +0,0 @@ -#ifndef MSNHNETAVXMATHEX_H -#define MSNHNETAVXMATHEX_H -/* - AVX implementation of sin, cos, sincos, exp and log - Based on "sse_mathfun.h", by Julien Pommier - http://gruntthepeon.free.fr/ssemath/ - Copyright (C) 2012 Giovanni Garberoglio - Interdisciplinary Laboratory for Computational Science (LISC) - Fondazione Bruno Kessler and University of Trento - via Sommarive, 18 - I-38123 Trento (Italy) - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. - (this is the zlib license) -*/ -#ifdef USE_X86 -#ifdef WIN32 -#include -#include -#include -#include -#else -#include -#include -#include -#include -#include -#endif -#include - -#define __AVX2__ - -/* yes I know, the top of this file is quite ugly */ -#if defined(__GNUC__) -#define ALIGN32_BEG __attribute__((aligned(32))) -#elif defined(_WIN32) -#define ALIGN32_BEG __declspec(align(32)) -#endif - -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] = {Val, Val, Val, Val} - -_PI32AVX_CONST(1, 1); -_PI32AVX_CONST(inv1, ~1); -_PI32AVX_CONST(2, 2); -_PI32AVX_CONST(4, 4); - -/* declare some AVX constants -- why can't I figure a better way to do that? 
*/ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] = {Val, Val, Val, Val, Val, Val, Val, Val} -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] = {Val, Val, Val, Val, Val, Val, Val, Val} -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] = {Val, Val, Val, Val, Val, Val, Val, Val} - -_PS256_CONST(1, 1.0f); -_PS256_CONST(0p5, 0.5f); -/* the smallest non denormalized float number */ -_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); -_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); -_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); - -_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - -_PI32_CONST256(0, 0); -_PI32_CONST256(1, 1); -_PI32_CONST256(inv1, ~1); -_PI32_CONST256(2, 2); -_PI32_CONST256(4, 4); -_PI32_CONST256(0x7f, 0x7f); - -_PS256_CONST(cephes_SQRTHF, 0.707106781186547524f); -_PS256_CONST(cephes_log_p0, 7.0376836292E-2f); -_PS256_CONST(cephes_log_p1, -1.1514610310E-1f); -_PS256_CONST(cephes_log_p2, 1.1676998740E-1f); -_PS256_CONST(cephes_log_p3, -1.2420140846E-1f); -_PS256_CONST(cephes_log_p4, +1.4249322787E-1f); -_PS256_CONST(cephes_log_p5, -1.6668057665E-1f); -_PS256_CONST(cephes_log_p6, +2.0000714765E-1f); -_PS256_CONST(cephes_log_p7, -2.4999993993E-1f); -_PS256_CONST(cephes_log_p8, +3.3333331174E-1f); -_PS256_CONST(cephes_log_q1, -2.12194440e-4f); -_PS256_CONST(cephes_log_q2, 0.693359375f); - -#ifndef __AVX2__ - -typedef union imm_xmm_union -{ - __m256i imm; - __m128i xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline __m256i _mm256_##fn(__m256i x, int a) \ - { \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - __m128i x1, x2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, a); \ - x2 = _mm_##fn(x2, a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -#warning "Using SSE2 to perform AVX2 bitshift ops" -AVX2_BITOP_USING_SSE2(slli_epi32) -AVX2_BITOP_USING_SSE2(srli_epi32) - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline __m256i _mm256_##fn(__m256i x, __m256i y) \ - { \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - __m128i x1, x2; \ - __m128i y1, y2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -#warning "Using SSE2 to perform AVX2 integer ops" -AVX2_INTOP_USING_SSE2(and_si128) -AVX2_INTOP_USING_SSE2(andnot_si128) -AVX2_INTOP_USING_SSE2(cmpeq_epi32) -AVX2_INTOP_USING_SSE2(sub_epi32) -AVX2_INTOP_USING_SSE2(add_epi32) - -#endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float - return NaN for x <= 0 -*/ -static inline __m256 log256_ps(__m256 x) -{ - __m256i imm0; - __m256 one = *(__m256*)_ps256_1; - - __m256 invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - - x = _mm256_max_ps(x, *(__m256*)_ps256_min_norm_pos); /* cut off denormalized stuff */ - - imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); - - /* keep only the fractional part */ - x = _mm256_and_ps(x, 
*(__m256*)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(__m256*)_ps256_0p5); - - imm0 = _mm256_sub_epi32(imm0, *(__m256i*)_pi32_256_0x7f); - __m256 e = _mm256_cvtepi32_ps(imm0); - - e = _mm256_add_ps(e, one); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - - __m256 mask = _mm256_cmp_ps(x, *(__m256*)_ps256_cephes_SQRTHF, _CMP_LT_OS); - __m256 tmp = _mm256_and_ps(x, mask); - x = _mm256_sub_ps(x, one); - e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); - x = _mm256_add_ps(x, tmp); - - __m256 z = _mm256_mul_ps(x, x); - - __m256 y = *(__m256*)_ps256_cephes_log_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p5); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p6); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p7); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_log_p8); - y = _mm256_mul_ps(y, x); - - y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); - - tmp = _mm256_mul_ps(z, *(__m256*)_ps256_0p5); - y = _mm256_sub_ps(y, tmp); - - tmp = _mm256_mul_ps(e, *(__m256*)_ps256_cephes_log_q2); - x = _mm256_add_ps(x, y); - x = _mm256_add_ps(x, tmp); - y = _mm256_or_ps(x, invalid_mask); - - return y; -} - -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); - -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341f); -_PS256_CONST(cephes_exp_C1, 0.693359375f); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4f); - -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4f); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3f); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3f); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2f); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1f); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1f); - -static inline __m256 exp256_ps(__m256 x) -{ - __m256 tmp = _mm256_setzero_ps(), fx; - __m256i imm0; - __m256 one = *(__m256*)_ps256_1; - - x = _mm256_min_ps(x, *(__m256*)_ps256_exp_hi); - x = _mm256_max_ps(x, *(__m256*)_ps256_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(__m256*)_ps256_0p5); - - /* how to perform a floorf with SSE: just below */ - - tmp = _mm256_floor_ps(fx); - - /* if greater, subtract 1 */ - - __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); - mask = _mm256_and_ps(mask, one); - fx = _mm256_sub_ps(tmp, mask); - - tmp = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C1); - __m256 z = _mm256_mul_ps(fx, *(__m256*)_ps256_cephes_exp_C2); - x = _mm256_sub_ps(x, tmp); - x = _mm256_sub_ps(x, z); - - z = _mm256_mul_ps(x, x); - - __m256 y = *(__m256*)_ps256_cephes_exp_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(__m256*)_ps256_cephes_exp_p5); - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, x); - y 
= _mm256_add_ps(y, one); - - /* build 2^n */ - imm0 = _mm256_cvttps_epi32(fx); - - imm0 = _mm256_add_epi32(imm0, *(__m256i*)_pi32_256_0x7f); - imm0 = _mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} - -_PS256_CONST(minus_cephes_DP1, -0.78515625f); -_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); -_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); -_PS256_CONST(sincof_p0, -1.9515295891E-4f); -_PS256_CONST(sincof_p1, 8.3321608736E-3f); -_PS256_CONST(sincof_p2, -1.6666654611E-1f); -_PS256_CONST(coscof_p0, 2.443315711809948E-005f); -_PS256_CONST(coscof_p1, -1.388731625493765E-003f); -_PS256_CONST(coscof_p2, 4.166664568298827E-002f); -_PS256_CONST(cephes_FOPI, 1.27323954473516f); - -/* evaluation of 8 sines at onces using AVX intrisics - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. -*/ -static inline __m256 sin256_ps(__m256 x) -{ - - __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; - __m256i imm0, imm2; - -#ifndef __AVX2__ - __m128i imm0_1, imm0_2; - __m128i imm2_1, imm2_2; -#endif - - sign_bit = x; - /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps256_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(__m256*)_ps256_sign_mask); - - /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(__m256*)_ps256_cephes_FOPI); - - /* - Here we start a series of integer operations, which are in the - realm of AVX2. 
- If we don't have AVX, let's perform them using SSE2 directives - */ - -#ifdef __AVX2__ - /* store the integer part of y in mm0 */ - imm2 = _mm256_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - - imm2 = _mm256_add_epi32(imm2, *(__m256i*)_pi32_256_1); - imm2 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_inv1); - y = _mm256_cvtepi32_ps(imm2); - - /* get the swap sign flag */ - imm0 = _mm256_and_si256(imm2, *(__m256i*)_pi32_256_4); - imm0 = _mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 - -#if (__ARM_FP & 2) -static inline float32x4_t loadfp16(const void* ptr) -{ -#if __ARM_FP16_FORMAT_IEEE - return vcvt_f32_f16(vld1_f16((const __fp16*)ptr)); -#else - - float32x4_t v; -#if __aarch64__ - asm volatile( - "ld1 {v0.4h}, [%2] \n" - "fcvtl %0.4s, v0.4h \n" - : "=w"(v) - - : "0"(v), - "r"(ptr) - - : "memory", "v0"); -#else - asm volatile( - "vld1.s16 {d0}, [%2] \n" - "vcvt.f32.f16 %q0, d0 \n" - : "=w"(v) - - : "0"(v), - "r"(ptr) - - : "memory", "d0"); -#endif - - return v; -#endif - -} -#endif - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -/* natural logarithm computed for 4 simultaneous float - * return NaN for x <= 0 - */ -static inline float32x4_t log_ps(float32x4_t x) -{ - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */ - uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - /* keep only the fractional part */ - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - /* part2: - * if( x < SQRTHF ) { - * e -= 1; - * x = x + x - 1.0; - * } else { x = x - 1.0; } - */ - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = 
vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); - - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -/* exp() computed for 4 float at once */ -static inline float32x4_t exp_ps(float32x4_t x) -{ - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - /* perform a floorf */ - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - /* if greater, substract 1 */ - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - /* build 2^n */ - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 - -/* evaluation of 4 sines & cosines at once. - * - * The code is the exact rewriting of the cephes sinf function. - * Precision is excellent as long as x < 8192 (I did not bother to - * take into account the special handling they have for greater values - * -- it does not return garbage for arguments over 8192, though, but - * the extra precision is missing). - * - * Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - * surprising but correct result. - * - * Note also that when you compute sin(x), cos(x) is available at - * almost no extra price so both sin_ps and cos_ps make use of - * sincos_ps.. 
- */ -static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos) -{ - - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - /* scale by 4/Pi */ - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - /* store the integer part of y in mm0 */ - emm2 = vcvtq_u32_f32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - /* get the polynom selection mask - * there is one polynom for 0 <= x <= Pi/4 - * and another one for Pi/4
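The AVX header removed by this patch defines exp256_ps, log256_ps and sin256_ps as eight-lane Cephes-style replacements for the libm calls. The sketch below is a hypothetical caller (not part of this patch) showing how those helpers would have been exercised before the removal; the file name is invented, and it assumes the pre-removal header is still on the include path, that USE_X86 is defined (the header body is wrapped in #ifdef USE_X86), and that the code is built for AVX2, since the header force-defines __AVX2__ and uses the 256-bit integer intrinsics.

// check_avx_mathex.cpp -- hypothetical sanity check against libm, not part of this patch.
// Build sketch (assumption): g++ -mavx2 -DUSE_X86 check_avx_mathex.cpp
#include <cmath>
#include <cstdio>
#include <immintrin.h>
#include "Msnhnet/core/MsnhnetAvxMathEx.h"

int main()
{
    // Inputs stay inside the clamp range used by exp256_ps (about +/- 88.376).
    float in[8] = {0.1f, 0.5f, 1.0f, 2.0f, 3.5f, 7.0f, 13.0f, 80.0f};
    float ex[8], lg[8];

    __m256 v = _mm256_loadu_ps(in);
    _mm256_storeu_ps(ex, exp256_ps(v));   // polynomial for exp(g), then 2^n scaling, 8 lanes at once
    _mm256_storeu_ps(lg, log256_ps(v));   // returns NaN for x <= 0

    for (int i = 0; i < 8; ++i)
    {
        printf("x=%g  exp: simd=%g libm=%g  log: simd=%g libm=%g\n",
               in[i], ex[i], std::exp(in[i]), lg[i], std::log(in[i]));
    }
    return 0;
}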
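The NEON header removed by the same commit provides the matching four-lane helpers (log_ps, exp_ps, sincos_ps). A similarly hypothetical sketch for sincos_ps, which produces sin and cos together from a single 4/Pi range reduction, is below; it assumes the pre-removal header and a NEON-capable target, and any configuration macro guarding that header is not visible in the truncated diff above.

// check_neon_mathex.cpp -- hypothetical sanity check against libm, not part of this patch.
#include <cmath>
#include <cstdio>
#include <arm_neon.h>
#include "Msnhnet/core/MsnhnetNeonMathEx.h"

int main()
{
    float in[4] = {0.0f, 0.5f, 1.5707964f, 3.1415927f};
    float s[4], c[4];

    float32x4_t vs, vc;
    sincos_ps(vld1q_f32(in), &vs, &vc);   // one pass yields both sin and cos
    vst1q_f32(s, vs);
    vst1q_f32(c, vc);

    for (int i = 0; i < 4; ++i)
    {
        // As the header comment notes, sin((float)M_PI) comes out near 8.74e-8 rather than 0.
        printf("x=%g  sin=%g (libm %g)  cos=%g (libm %g)\n",
               in[i], s[i], std::sin(in[i]), c[i], std::cos(in[i]));
    }
    return 0;
}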