mirror of
https://sourceware.org/git/glibc.git
synced 2025-12-20 01:12:17 +08:00
AArch64: Implement exp2m1 and exp10m1 routines
Vector variants of the new C23 exp2m1 & exp10m1 routines. Note: Benchmark inputs for exp2m1 & exp10m1 are identical to exp2 & exp10 respectively, this also includes the floating point variations. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
committed by
Wilco Dijkstra
parent
54bd776f99
commit
8ced7815fb
4100
benchtests/libmvec/exp10m1-inputs
Normal file
4100
benchtests/libmvec/exp10m1-inputs
Normal file
File diff suppressed because it is too large
Load Diff
4100
benchtests/libmvec/exp10m1f-inputs
Normal file
4100
benchtests/libmvec/exp10m1f-inputs
Normal file
File diff suppressed because it is too large
Load Diff
4100
benchtests/libmvec/exp2m1-inputs
Normal file
4100
benchtests/libmvec/exp2m1-inputs
Normal file
File diff suppressed because it is too large
Load Diff
4100
benchtests/libmvec/exp2m1f-inputs
Normal file
4100
benchtests/libmvec/exp2m1f-inputs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -187,6 +187,28 @@
|
||||
#define __DECL_SIMD_expm1f64x
|
||||
#define __DECL_SIMD_expm1f128x
|
||||
|
||||
#define __DECL_SIMD_exp2m1
|
||||
#define __DECL_SIMD_exp2m1f
|
||||
#define __DECL_SIMD_exp2m1l
|
||||
#define __DECL_SIMD_exp2m1f16
|
||||
#define __DECL_SIMD_exp2m1f32
|
||||
#define __DECL_SIMD_exp2m1f64
|
||||
#define __DECL_SIMD_exp2m1f128
|
||||
#define __DECL_SIMD_exp2m1f32x
|
||||
#define __DECL_SIMD_exp2m1f64x
|
||||
#define __DECL_SIMD_exp2m1f128x
|
||||
|
||||
#define __DECL_SIMD_exp10m1
|
||||
#define __DECL_SIMD_exp10m1f
|
||||
#define __DECL_SIMD_exp10m1l
|
||||
#define __DECL_SIMD_exp10m1f16
|
||||
#define __DECL_SIMD_exp10m1f32
|
||||
#define __DECL_SIMD_exp10m1f64
|
||||
#define __DECL_SIMD_exp10m1f128
|
||||
#define __DECL_SIMD_exp10m1f32x
|
||||
#define __DECL_SIMD_exp10m1f64x
|
||||
#define __DECL_SIMD_exp10m1f128x
|
||||
|
||||
#define __DECL_SIMD_sinh
|
||||
#define __DECL_SIMD_sinhf
|
||||
#define __DECL_SIMD_sinhl
|
||||
|
||||
@@ -136,10 +136,10 @@ __MATHCALL (modf,, (_Mdouble_ __x, _Mdouble_ *__iptr)) __nonnull ((2));
|
||||
__MATHCALL_VEC (exp10,, (_Mdouble_ __x));
|
||||
|
||||
/* Return exp2(X) - 1. */
|
||||
__MATHCALL (exp2m1,, (_Mdouble_ __x));
|
||||
__MATHCALL_VEC (exp2m1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return exp10(X) - 1. */
|
||||
__MATHCALL (exp10m1,, (_Mdouble_ __x));
|
||||
__MATHCALL_VEC (exp10m1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return log2(1 + X). */
|
||||
__MATHCALL (log2p1,, (_Mdouble_ __x));
|
||||
|
||||
@@ -19,6 +19,8 @@ libmvec-supported-funcs = acos \
|
||||
exp10 \
|
||||
exp2 \
|
||||
expm1 \
|
||||
exp2m1 \
|
||||
exp10m1 \
|
||||
hypot \
|
||||
log \
|
||||
log10 \
|
||||
|
||||
@@ -179,4 +179,16 @@ libmvec {
|
||||
_ZGVsMxvv_atan2pi;
|
||||
_ZGVsMxvv_atan2pif;
|
||||
}
|
||||
GLIBC_2.43 {
|
||||
_ZGVnN2v_exp2m1;
|
||||
_ZGVnN2v_exp2m1f;
|
||||
_ZGVnN4v_exp2m1f;
|
||||
_ZGVsMxv_exp2m1;
|
||||
_ZGVsMxv_exp2m1f;
|
||||
_ZGVnN2v_exp10m1;
|
||||
_ZGVnN2v_exp10m1f;
|
||||
_ZGVnN4v_exp10m1f;
|
||||
_ZGVsMxv_exp10m1;
|
||||
_ZGVsMxv_exp10m1f;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,8 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp2));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp));
|
||||
libmvec_hidden_proto (V_NAME_F1(expm1));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp2m1));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp10m1));
|
||||
libmvec_hidden_proto (V_NAME_F2(hypot));
|
||||
libmvec_hidden_proto (V_NAME_F1(log10));
|
||||
libmvec_hidden_proto (V_NAME_F1(log1p));
|
||||
|
||||
@@ -113,6 +113,14 @@
|
||||
# define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_expm1f
|
||||
# define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp2m1
|
||||
# define __DECL_SIMD_exp2m1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp2m1f
|
||||
# define __DECL_SIMD_exp2m1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp10m1
|
||||
# define __DECL_SIMD_exp10m1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp10m1f
|
||||
# define __DECL_SIMD_exp10m1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_hypot
|
||||
# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_hypotf
|
||||
@@ -212,6 +220,8 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2m1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
|
||||
@@ -247,6 +257,8 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp2m1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
|
||||
@@ -287,6 +299,8 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp2m1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
|
||||
@@ -322,6 +336,8 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp2m1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
|
||||
|
||||
202
sysdeps/aarch64/fpu/exp10m1_advsimd.c
Normal file
202
sysdeps/aarch64/fpu/exp10m1_advsimd.c
Normal file
@@ -0,0 +1,202 @@
|
||||
/* Double-precision (Advanced SIMD) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 163840.0 /* 1280.0 * N. */
|
||||
|
||||
/* Value of |x| below which scale - 1 contributes produces large error. */
|
||||
#define TableBound 0x1.a308a4198c9d7p-4 /* log10(2) * 87/256. */
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
const static struct data
|
||||
{
|
||||
double c6, c0;
|
||||
double c2, c4;
|
||||
double log2_10_hi, log2_10_lo;
|
||||
float64x2_t c1, c3, c5;
|
||||
float64x2_t log10_2, shift;
|
||||
float64x2_t special_bound, scale_thresh;
|
||||
uint64x2_t sm1_tbl_off, sm1_tbl_mask;
|
||||
float64x2_t rnd2zero;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm. */
|
||||
.c0 = 0x1.26bb1bbb55516p1,
|
||||
.c1 = V2 (0x1.53524c73cea69p1),
|
||||
.c2 = 0x1.0470591b30b6bp1,
|
||||
.c3 = V2 (0x1.2bd7609b57e25p0),
|
||||
.c4 = 0x1.1517d41bddcbep-1,
|
||||
.c5 = V2 (0x1.a9a12d6f0755cp-3),
|
||||
.c6 = -0x1.76a8d3abd7025p9,
|
||||
|
||||
/* Values of x which should round to the zeroth index of the exp10m1 table. */
|
||||
.rnd2zero = V2 (-0x1.34413509f79ffp-10), /* (2^-8)/log2(10). */
|
||||
.sm1_tbl_off = V2 (24),
|
||||
.sm1_tbl_mask = V2 (0x3f),
|
||||
|
||||
.log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */
|
||||
.log2_10_hi = 0x1.34413509f79ffp-9, /* log2(10)/N. */
|
||||
.log2_10_lo = -0x1.9dc1da994fd21p-66,
|
||||
.shift = V2 (0x1.8p+52),
|
||||
.scale_thresh = V2 (ScaleBound),
|
||||
.special_bound = V2 (SpecialBound),
|
||||
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
|
||||
#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
|
||||
|
||||
static inline float64x2_t VPCS_ATTR
|
||||
special_case (float64x2_t s, float64x2_t y, float64x2_t n,
|
||||
const struct data *d)
|
||||
{
|
||||
/* 2^(n/N) may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (
|
||||
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
|
||||
uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
|
||||
return vsubq_f64 (vbslq_f64 (cmp, r1, r0), v_f64 (1.0));
|
||||
}
|
||||
|
||||
static inline uint64x2_t
|
||||
lookup_sbits (uint64x2_t i)
|
||||
{
|
||||
return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
|
||||
__v_exp_data[i[1] & IndexMask] };
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
|
||||
uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
|
||||
uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
|
||||
uint64x2_t idx = vaddq_u64 (base_idx, offset);
|
||||
|
||||
uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
|
||||
return vreinterpretq_f64_u64 (sm1);
|
||||
}
|
||||
|
||||
/* Fast vector implementation of exp10m1.
|
||||
Maximum measured error is 2.53 + 0.5 ulp.
|
||||
_ZGVnN2v_exp10m1 (0x1.347007ffed62cp-10) got 0x1.63955944d1d83p-9
|
||||
want 0x1.63955944d1d80p-9. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (exp10m1) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint64x2_t cmp = vcageq_f64 (x, d->special_bound);
|
||||
|
||||
/* n = round(x/(log10(2)/N)). */
|
||||
float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (z);
|
||||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
float64x2_t c24 = vld1q_f64 (&d->c2);
|
||||
float64x2_t c60 = vld1q_f64 (&d->c6);
|
||||
float64x2_t log_2_10 = vld1q_f64 (&d->log2_10_hi);
|
||||
|
||||
/* r = x - n*log10(2)/N. */
|
||||
float64x2_t r = x;
|
||||
r = vfmsq_laneq_f64 (r, n, log_2_10, 0);
|
||||
r = vfmsq_laneq_f64 (r, n, log_2_10, 1);
|
||||
|
||||
/* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, c24, 0);
|
||||
float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c24, 1);
|
||||
float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c60, 0);
|
||||
|
||||
float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
|
||||
float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
|
||||
|
||||
float64x2_t p0 = vmulq_laneq_f64 (r, c60, 1);
|
||||
float64x2_t p = vfmaq_f64 (p0, r2, p16);
|
||||
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
|
||||
|
||||
/* scale = 2^(n/N). */
|
||||
uint64x2_t scale_bits = lookup_sbits (u);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
|
||||
float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
|
||||
if (v_any_u64 (is_small))
|
||||
scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
|
||||
|
||||
/* Construct exp10m1 = (scale - 1) + scale * poly. */
|
||||
float64x2_t y = vfmaq_f64 (scalem1, scale, p);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u64 (cmp)))
|
||||
return vbslq_f64 (cmp, special_case (scale, p, n, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
184
sysdeps/aarch64/fpu/exp10m1_sve.c
Normal file
184
sysdeps/aarch64/fpu/exp10m1_sve.c
Normal file
@@ -0,0 +1,184 @@
|
||||
/* Double-precision (SVE) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 0x1.33f4bedd4fa70p+8 /* log10(2^(1023 + 1/128)). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* Value of |x| below which scale - 1 contributes produces large error. */
|
||||
#define FexpaBound 0x1.2a9f2b61a7e2p-4 /* 31*(log10(2)/128). */
|
||||
|
||||
static const struct data
|
||||
{
|
||||
double log2_10_hi, log2_10_lo;
|
||||
double c3, c5;
|
||||
double c0, c1, c2, c4;
|
||||
double shift, log10_2, special_bound;
|
||||
uint64_t scalem1[32];
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm. */
|
||||
.c0 = 0x1.26bb1bbb55516p1,
|
||||
.c1 = 0x1.53524c73cea6ap1,
|
||||
.c2 = 0x1.0470591d8bd2ep1,
|
||||
.c3 = 0x1.2bd7609dfea43p0,
|
||||
.c4 = 0x1.142d0d89058f1p-1,
|
||||
.c5 = 0x1.a80cf2ddd513p-3,
|
||||
|
||||
/* 1.5*2^46+1023. This value is further explained below. */
|
||||
.shift = 0x1.800000000ffc0p+46,
|
||||
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
|
||||
.log2_10_hi = 0x1.34413509f79ffp-2, /* log2(10). */
|
||||
.log2_10_lo = -0x1.9dc1da994fd21p-59,
|
||||
.special_bound = SpecialBound,
|
||||
|
||||
/* Table emulating FEXPA - 1, for values of FEXPA close to 1.
|
||||
The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
|
||||
The 1st half contains values associated to i=0..+15.
|
||||
The 2nd half contains values associated to i=0..-15. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f,
|
||||
0x3fa0e8a30eb37901, 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440,
|
||||
0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, 0x3fb72b83c7d517ae,
|
||||
0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
|
||||
0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709,
|
||||
0x3fc6942d3720185a, 0x0000000000000000, 0xbfc331751ec3a814,
|
||||
0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, 0xbfbf332113d56b1f,
|
||||
0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
|
||||
0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a,
|
||||
0xbfaafd11874c009e, 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89,
|
||||
0xbf95f134923757f3, 0xbf860f9f985bc9f4,
|
||||
},
|
||||
};
|
||||
|
||||
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
|
||||
#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
|
||||
|
||||
static NOINLINE svfloat64_t
|
||||
special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
|
||||
svfloat64_t n)
|
||||
{
|
||||
/* s=2^n may overflow, break it up into s=s1*s2,
|
||||
such that exp = s + s*y can be computed as s1*(s2+s2*y)
|
||||
and s1*s1 overflows only if n>0. */
|
||||
|
||||
/* If n<=0 then set b to 0x6, 0 otherwise. */
|
||||
svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
|
||||
svuint64_t b
|
||||
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
|
||||
|
||||
/* Set s1 to generate overflow depending on sign of exponent n,
|
||||
ie. s1 = 0x70...0 - b. */
|
||||
svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
|
||||
/* Offset s to avoid overflow in final result if n is below threshold.
|
||||
ie. s2 = as_u64 (s) - 0x3010...0 + b. */
|
||||
svfloat64_t s2 = svreinterpret_f64 (
|
||||
svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
|
||||
|
||||
/* |n| > 1280 => 2^(n) overflows. */
|
||||
svbool_t p_cmp = svacgt (pg, n, ScaleBound);
|
||||
|
||||
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
|
||||
svfloat64_t r2 = svmla_x (pg, s2, s2, p);
|
||||
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
|
||||
|
||||
svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
|
||||
return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
|
||||
}
|
||||
|
||||
/* FEXPA based SVE exp10m1 algorithm.
|
||||
Maximum measured error is 2.87 + 0.5 ULP:
|
||||
_ZGVsMxv_exp10m1(0x1.64645f11e94c6p-4) got 0x1.c64d54eb7658dp-3
|
||||
want 0x1.c64d54eb7658ap-3. */
|
||||
svfloat64_t SV_NAME_D1 (exp10m1) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
svbool_t special = svacgt (pg, x, d->special_bound);
|
||||
|
||||
/* n = round(x/(log10(2)/N)). */
|
||||
svfloat64_t shift = sv_f64 (d->shift);
|
||||
svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
|
||||
svfloat64_t n = svsub_x (pg, z, shift);
|
||||
|
||||
/* r = x - n*log10(2)/N. */
|
||||
svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
|
||||
svfloat64_t r = x;
|
||||
r = svmls_lane (r, n, log2_10, 0);
|
||||
r = svmls_lane (r, n, log2_10, 1);
|
||||
|
||||
/* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
|
||||
for consistent NaN handling we have to manually propagate them. This
|
||||
comes at significant performance cost. */
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t scale = svexpa (u);
|
||||
svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c3);
|
||||
/* Approximate exp10(r) using polynomial. */
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, sv_f64 (d->c1));
|
||||
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c24, 0);
|
||||
svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c24, 1);
|
||||
svfloat64_t p25 = svmla_x (pg, p23, p45, r2);
|
||||
svfloat64_t p05 = svmla_x (pg, p01, p25, r2);
|
||||
|
||||
svfloat64_t p = svmul_x (pg, p05, r);
|
||||
|
||||
svfloat64_t scalem1 = svsub_x (pg, scale, 1.0);
|
||||
|
||||
/* For small values, use a lookup table for a more accurate scalem1. */
|
||||
svbool_t is_small = svaclt (pg, x, FexpaBound);
|
||||
if (svptest_any (pg, is_small))
|
||||
{
|
||||
/* Use the low 4 bits of the input of FEXPA as index. */
|
||||
svuint64_t base_idx = svand_x (pg, u, 0xf);
|
||||
|
||||
/* We can use the sign of x as a fifth bit to account for the asymmetry
|
||||
of e^x around 0. */
|
||||
svuint64_t signBit
|
||||
= svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
|
||||
svuint64_t idx = svorr_x (pg, base_idx, signBit);
|
||||
|
||||
/* Lookup values for scale - 1 for small x. */
|
||||
svfloat64_t lookup
|
||||
= svreinterpret_f64 (svld1_gather_index (is_small, d->scalem1, idx));
|
||||
|
||||
/* Select the appropriate scale - 1 value based on x. */
|
||||
scalem1 = svsel (is_small, lookup, scalem1);
|
||||
}
|
||||
|
||||
svfloat64_t y = svmla_x (pg, scalem1, scale, p);
|
||||
|
||||
/* FEXPA returns nan for large inputs so we special case those. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
{
|
||||
/* FEXPA zeroes the sign bit, however the sign is meaningful to the
|
||||
special case function so needs to be copied.
|
||||
e = sign bit of u << 46. */
|
||||
svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
|
||||
/* Copy sign to scale. */
|
||||
scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
|
||||
return special_case (pg, y, scale, p, n);
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
120
sysdeps/aarch64/fpu/exp10m1f_advsimd.c
Normal file
120
sysdeps/aarch64/fpu/exp10m1f_advsimd.c
Normal file
@@ -0,0 +1,120 @@
|
||||
/* Single-precision (Advanced SIMD) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float log10_2_high, log10_2_low;
|
||||
float log10_lo, c2, c4, c6;
|
||||
float32x4_t log10_hi, c1, c3, c5, c7, c8;
|
||||
float32x4_t inv_log10_2, special_bound;
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
float32x4_t scale_thresh;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm with minimisation of relative
|
||||
error. */
|
||||
.log10_hi = V4 (0x1.26bb1b8000000p+1),
|
||||
.log10_lo = 0x1.daaa8b0000000p-26,
|
||||
.c1 = V4 (0x1.53524ep1),
|
||||
.c2 = 0x1.046fc8p1,
|
||||
.c3 = V4 (0x1.2bd376p0),
|
||||
.c4 = 0x1.156f8p-1,
|
||||
.c5 = V4 (0x1.b28c0ep-3),
|
||||
.c6 = -0x1.05e38ep-4,
|
||||
.c7 = V4 (-0x1.c79f4ap-4),
|
||||
.c8 = V4 (0x1.2d6f34p1),
|
||||
.inv_log10_2 = V4 (0x1.a934fp+1),
|
||||
.log10_2_high = 0x1.344136p-2,
|
||||
.log10_2_low = 0x1.ec10cp-27,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
.scale_thresh = V4 (ScaleBound),
|
||||
.special_bound = V4 (SpecialBound),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
|
||||
}
|
||||
|
||||
/* Fast vector implementation of single-precision exp10m1.
|
||||
Algorithm is accurate to 1.70 + 0.5 ULP.
|
||||
_ZGVnN4v_exp10m1f(0x1.36f94cp-3) got 0x1.ac96acp-2
|
||||
want 0x1.ac96bp-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10m1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
float32x4_t log10_2 = vld1q_f32 (&d->log10_2_high);
|
||||
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
|
||||
float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2, 0);
|
||||
r = vfmaq_laneq_f32 (r, n, log10_2, 1);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
|
||||
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
float32x4_t log10lo_c246 = vld1q_f32 (&d->log10_lo);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p78 = vfmaq_f32 (d->c7, r, d->c8);
|
||||
float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log10lo_c246, 3);
|
||||
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10lo_c246, 2);
|
||||
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10lo_c246, 1);
|
||||
float32x4_t p58 = vfmaq_f32 (p56, r2, p78);
|
||||
float32x4_t p36 = vfmaq_f32 (p34, r2, p58);
|
||||
float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
|
||||
|
||||
float32x4_t poly
|
||||
= vfmaq_laneq_f32 (vmulq_f32 (d->log10_hi, r), r, log10lo_c246, 0);
|
||||
poly = vfmaq_f32 (poly, p16, r2);
|
||||
|
||||
float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u32 (cmp)))
|
||||
return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (exp10m1))
|
||||
HALF_WIDTH_ALIAS_F1 (exp10m1)
|
||||
122
sysdeps/aarch64/fpu/exp10m1f_sve.c
Normal file
122
sysdeps/aarch64/fpu/exp10m1f_sve.c
Normal file
@@ -0,0 +1,122 @@
|
||||
/* Single-precision (SVE) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float log10_2_high, log10_2_low;
|
||||
float log10_lo, c2, c4, c6, c8;
|
||||
float32_t log10_hi, c1, c3, c5, c7;
|
||||
float32_t inv_log10_2, special_bound;
|
||||
uint32_t exponent_bias, special_offset, special_bias;
|
||||
float32_t scale_thresh;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm with minimisation of relative
|
||||
error. */
|
||||
.log10_hi = 0x1.26bb1b8000000p+1,
|
||||
.log10_lo = 0x1.daaa8b0000000p-26,
|
||||
.c1 = 0x1.53524ep1,
|
||||
.c2 = 0x1.046fc8p1,
|
||||
.c3 = 0x1.2bd376p0,
|
||||
.c4 = 0x1.156f8p-1,
|
||||
.c5 = 0x1.b28c0ep-3,
|
||||
.c6 = -0x1.05e38ep-4,
|
||||
.c7 = -0x1.c79f4ap-4,
|
||||
.c8 = 0x1.2d6f34p1,
|
||||
.inv_log10_2 = 0x1.a934fp+1,
|
||||
.log10_2_high = 0x1.344136p-2,
|
||||
.log10_2_low = 0x1.ec10cp-27,
|
||||
.exponent_bias = 0x3f800000,
|
||||
.special_offset = 0x82000000,
|
||||
.special_bias = 0x7f000000,
|
||||
.scale_thresh = ScaleBound,
|
||||
.special_bound = SpecialBound,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
|
||||
svfloat32_t scale, const struct data *d)
|
||||
{
|
||||
svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
|
||||
svfloat32_t s1 = svreinterpret_f32 (
|
||||
svsel (b, sv_u32 (d->special_offset + d->special_bias),
|
||||
sv_u32 (d->special_bias)));
|
||||
svfloat32_t s2
|
||||
= svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
|
||||
svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
|
||||
svfloat32_t r1
|
||||
= svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
|
||||
svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
|
||||
svfloat32_t r = svsel (cmp1, r1, r0);
|
||||
return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
|
||||
}
|
||||
|
||||
/* Fast vector implementation of single-precision exp10.
|
||||
Algorithm is accurate to 1.68 + 0.5 ULP.
|
||||
_ZGVnN4v_exp10m1f(0x1.3aeffep-3) got 0x1.b3139p-2
|
||||
want 0x1.b3138cp-2. */
|
||||
svfloat32_t SV_NAME_F1 (exp10m1) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
svfloat32_t log10_2 = svld1rq (svptrue_b32 (), &d->log10_2_high);
|
||||
svfloat32_t n = svrinta_x (pg, svmul_x (pg, x, d->inv_log10_2));
|
||||
svfloat32_t r = svmls_lane_f32 (x, n, log10_2, 0);
|
||||
r = svmla_lane_f32 (r, n, log10_2, 1);
|
||||
|
||||
svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
|
||||
|
||||
svfloat32_t scale
|
||||
= svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
|
||||
svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
svfloat32_t r2 = svmul_x (pg, r, r);
|
||||
svfloat32_t c2468 = svld1rq (svptrue_b32 (), &d->c2);
|
||||
svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), r, c2468, 3);
|
||||
svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, c2468, 2);
|
||||
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, c2468, 1);
|
||||
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, c2468, 0);
|
||||
svfloat32_t p58 = svmla_x (pg, p56, r2, p78);
|
||||
svfloat32_t p36 = svmla_x (pg, p34, r2, p58);
|
||||
svfloat32_t p16 = svmla_x (pg, p12, r2, p36);
|
||||
|
||||
svfloat32_t poly = svmla_n_f32_x (pg, svmul_x (pg, r, sv_f32 (d->log10_hi)),
|
||||
r, d->log10_lo);
|
||||
poly = svmla_x (pg, poly, p16, r2);
|
||||
|
||||
svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
194
sysdeps/aarch64/fpu/exp2m1_advsimd.c
Normal file
194
sysdeps/aarch64/fpu/exp2m1_advsimd.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/* Double-precision (Advanced SIMD) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 1022.0
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* 87/256, value of x under which table lookup is used for 2^x-1. */
|
||||
#define TableBound 0x1.5bfffffffffffp-2
|
||||
|
||||
/* Number of bits for each value in the table. */
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t exponent_bias, special_offset, special_bias, special_bias2,
|
||||
sm1_tbl_off, sm1_tbl_mask;
|
||||
float64x2_t scale_thresh, special_bound, shift, rnd2zero;
|
||||
float64x2_t log2_hi, c1, c3, c5;
|
||||
double log2_lo, c2, c4, c6;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1(x). */
|
||||
.log2_hi = V2 (0x1.62e42fefa39efp-1),
|
||||
.log2_lo = 0x1.abc9e3b39803f3p-56,
|
||||
.c1 = V2 (0x1.ebfbdff82c58ep-3),
|
||||
.c2 = 0x1.c6b08d71f5804p-5,
|
||||
.c3 = V2 (0x1.3b2ab6fee7509p-7),
|
||||
.c4 = 0x1.5d1d37eb33b15p-10,
|
||||
.c5 = V2 (0x1.423f35f371d9ap-13),
|
||||
.c6 = 0x1.e7d57ad9a5f93p-5,
|
||||
.exponent_bias = V2 (0x3ff0000000000000),
|
||||
.special_offset = V2 (0x6000000000000000), /* 0x1p513. */
|
||||
.special_bias = V2 (0x7000000000000000), /* 0x1p769. */
|
||||
.special_bias2 = V2 (0x3010000000000000), /* 0x1p-254. */
|
||||
.scale_thresh = V2 (ScaleBound),
|
||||
.special_bound = V2 (SpecialBound),
|
||||
.shift = V2 (0x1.8p52 / N),
|
||||
.rnd2zero = V2 (-0x1p-8),
|
||||
.sm1_tbl_off = V2 (24),
|
||||
.sm1_tbl_mask = V2 (0x3f),
|
||||
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
static inline uint64x2_t
|
||||
lookup_sbits (uint64x2_t i)
|
||||
{
|
||||
return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
|
||||
__v_exp_data[i[1] & IndexMask] };
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
|
||||
uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
|
||||
uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
|
||||
uint64x2_t idx = vaddq_u64 (base_idx, offset);
|
||||
|
||||
/* Fall back to table lookup for 2^x - 1, when x is close to zero to
|
||||
avoid large errors. */
|
||||
uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
|
||||
return vreinterpretq_f64_u64 (sm1);
|
||||
}
|
||||
|
||||
static inline VPCS_ATTR float64x2_t
|
||||
special_case (float64x2_t poly, float64x2_t n, uint64x2_t e, float64x2_t scale,
|
||||
const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vclezq_f64 (n), d->special_offset);
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (d->special_bias, b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
|
||||
vsubq_u64 (vreinterpretq_u64_f64 (scale), d->special_bias2), b));
|
||||
uint64x2_t cmp2 = vcagtq_f64 (n, d->scale_thresh);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r2 = vmulq_f64 (vfmaq_f64 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
return vsubq_f64 (vbslq_f64 (cmp2, r1, r2), v_f64 (1.0f));
|
||||
}
|
||||
|
||||
/* Double-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 2.55 + 0.5 ULP.
|
||||
_ZGVnN2v_exp2m1 (0x1.1113e87a035ap-8) got 0x1.7b1d06f0a7d36p-9
|
||||
want 0x1.7b1d06f0a7d33p-9. */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (exp2m1) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r))
|
||||
x = n + r, with r in [-1/2N, 1/2N].
|
||||
n is a floating point number, multiple of 1/N. */
|
||||
float64x2_t z = vaddq_f64 (d->shift, x);
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (z);
|
||||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
/* Calculate scale, 2^n. */
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
|
||||
uint64x2_t scale_bits = lookup_sbits (u);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
|
||||
|
||||
uint64x2_t cmp = vcagtq_f64 (x, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
float64x2_t r = vsubq_f64 (x, n);
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
|
||||
float64x2_t log2lo_c2 = vld1q_f64 (&d->log2_lo);
|
||||
float64x2_t c4c6 = vld1q_f64 (&d->c4);
|
||||
|
||||
float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c4c6, 1);
|
||||
float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c4c6, 0);
|
||||
float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, log2lo_c2, 1);
|
||||
float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
|
||||
float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
|
||||
float64x2_t poly
|
||||
= vfmaq_laneq_f64 (vmulq_f64 (d->log2_hi, r), r, log2lo_c2, 0);
|
||||
poly = vfmaq_f64 (poly, p16, r2);
|
||||
|
||||
float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
|
||||
if (v_any_u64 (is_small))
|
||||
scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
|
||||
|
||||
/* Construct exp2m1 = (scale - 1) + scale * poly. */
|
||||
float64x2_t y = vfmaq_f64 (scalem1, poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u64 (cmp)))
|
||||
return vbslq_f64 (cmp, special_case (poly, n, e, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
197
sysdeps/aarch64/fpu/exp2m1_sve.c
Normal file
197
sysdeps/aarch64/fpu/exp2m1_sve.c
Normal file
@@ -0,0 +1,197 @@
|
||||
/* Double-precision (SVE) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 1022.0
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* 87/256, value of x under which table lookup is used for 2^x-1. */
|
||||
#define TableBound 0x1.5bfffffffffffp-2
|
||||
|
||||
/* Number of bits for each value in the table. */
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
double shift, rnd2zero;
|
||||
double c1, c3, c5;
|
||||
double c0, c2, c4;
|
||||
uint64_t sm1_tbl_mask, sm1_tbl_off;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Generated using fpminimax. */
|
||||
.c0 = 0x1.62e42fefa39efp-1,
|
||||
.c1 = 0x1.ebfbdff82c58ep-3,
|
||||
.c2 = 0x1.c6b08d707e662p-5,
|
||||
.c3 = 0x1.3b2ab6fc45f33p-7,
|
||||
.c4 = 0x1.5d86c0ff8618dp-10,
|
||||
.c5 = 0x1.4301374d5d2f5p-13,
|
||||
.shift = 0x1.8p52 / N,
|
||||
.sm1_tbl_mask = 0x3f,
|
||||
.sm1_tbl_off = 24,
|
||||
.rnd2zero = -0x1p-8,
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1= {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
static inline svuint64_t
|
||||
lookup_sbits (svbool_t pg, svuint64_t indices)
|
||||
{
|
||||
/* Mask indices to valid range. */
|
||||
svuint64_t masked = svand_z (pg, indices, svdup_n_u64 (IndexMask));
|
||||
return svld1_gather_index (pg, __v_exp_data, masked);
|
||||
}
|
||||
|
||||
static inline svfloat64_t
|
||||
lookup_sm1bits (svbool_t pg, svfloat64_t x, svuint64_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
svbool_t signBit = svcmplt_f64 (pg, x, sv_f64 (d->rnd2zero));
|
||||
svuint64_t base_idx = svand_x (pg, u, d->sm1_tbl_mask);
|
||||
svuint64_t idx = svadd_m (signBit, base_idx, sv_u64 (d->sm1_tbl_off));
|
||||
|
||||
/* Fall back to table lookup for 2^x - 1, when x is close to zero to
|
||||
avoid large errors. */
|
||||
svuint64_t sm1 = svld1_gather_index (svptrue_b64 (), d->scalem1, idx);
|
||||
return svreinterpret_f64 (sm1);
|
||||
}
|
||||
|
||||
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
|
||||
#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
|
||||
|
||||
static inline svfloat64_t
|
||||
special_case (svfloat64_t s, svfloat64_t y, svfloat64_t n,
|
||||
const struct data *d)
|
||||
{
|
||||
/* s=2^n may overflow, break it up into s=s1*s2,
|
||||
such that exp = s + s*y can be computed as s1*(s2+s2*y)
|
||||
and s1*s1 overflows only if n>0. */
|
||||
svbool_t p_sign = svcmple (svptrue_b64 (), n, 0.0); /* n <= 0. */
|
||||
svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
|
||||
|
||||
/* Set s1 to generate overflow depending on sign of exponent n. */
|
||||
svfloat64_t s1
|
||||
= svreinterpret_f64 (svsubr_x (svptrue_b64 (), b, SpecialBias1));
|
||||
/* Offset s to avoid overflow in final result if n is below threshold. */
|
||||
svfloat64_t s2 = svreinterpret_f64 (svadd_x (
|
||||
svptrue_b64 (),
|
||||
svsub_x (svptrue_b64 (), svreinterpret_u64 (s), SpecialBias2), b));
|
||||
|
||||
/* |n| > 1280 => 2^(n) overflows. */
|
||||
svbool_t p_cmp = svacle (svptrue_b64 (), n, ScaleBound);
|
||||
|
||||
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
|
||||
svfloat64_t r2 = svmla_x (svptrue_b64 (), s2, s2, y);
|
||||
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
|
||||
|
||||
return svsub_x (svptrue_b64 (), svsel (p_cmp, r0, r1), 1.0);
|
||||
}
|
||||
|
||||
/* Double-precision SVE exp2(x) - 1.
|
||||
Maximum error is 2.58 + 0.5 ULP.
|
||||
_ZGVsMxv_exp2m1(0x1.0284a345c99bfp-8) got 0x1.66df630cd2965p-9
|
||||
want 0x1.66df630cd2962p-9. */
|
||||
svfloat64_t SV_NAME_D1 (exp2m1) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
/* exp2(x) = 2^n (1 + poly(r))
|
||||
x = n + r, with r in [-1/2N, 1/2N].
|
||||
n is a floating point number, multiple of 1/N. */
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
svbool_t special = svacge (pg, x, SpecialBound);
|
||||
|
||||
svfloat64_t z = svadd_x (pg, x, d->shift);
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
|
||||
/* Look up table to calculate 2^n. */
|
||||
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TABLE_BITS);
|
||||
svuint64_t scale_bits = lookup_sbits (pg, u);
|
||||
svfloat64_t scale = svreinterpret_f64_u64 (svadd_x (pg, scale_bits, e));
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
svfloat64_t c35 = svld1rq (svptrue_b64 (), &d->c3);
|
||||
|
||||
svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, d->c1);
|
||||
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c35, 0);
|
||||
svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c35, 1);
|
||||
|
||||
svfloat64_t p25 = svmla_x (pg, p23, r2, p45);
|
||||
svfloat64_t p05 = svmla_x (pg, p01, r2, p25);
|
||||
svfloat64_t poly = svmul_x (pg, p05, r);
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
svbool_t is_small = svaclt (pg, x, TableBound);
|
||||
svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
|
||||
if (svptest_any (pg, is_small))
|
||||
scalem1 = svsel_f64 (is_small, lookup_sm1bits (pg, x, u, d), scalem1);
|
||||
|
||||
/* Construct exp2m1 = (scale - 1) + scale * poly. */
|
||||
svfloat64_t y = svmla_x (pg, scalem1, scale, poly);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return svsel_f64 (special, special_case (scale, poly, n, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
106
sysdeps/aarch64/fpu/exp2m1f_advsimd.c
Normal file
106
sysdeps/aarch64/fpu/exp2m1f_advsimd.c
Normal file
@@ -0,0 +1,106 @@
|
||||
/* Single-precision (Advanced SIMD) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
float32x4_t scale_thresh, special_bound;
|
||||
float32x4_t log2_hi, c1, c3, c5;
|
||||
float log2_lo, c2, c4, c6;
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1f(x). */
|
||||
.log2_hi = V4 (0x1.62e43p-1),
|
||||
.log2_lo = -0x1.05c610p-29,
|
||||
.c1 = V4 (0x1.ebfbep-3),
|
||||
.c2 = 0x1.c6b06ep-5,
|
||||
.c3 = V4 (0x1.3b2a5cp-7),
|
||||
.c4 = 0x1.5da59ep-10,
|
||||
.c5 = V4 (0x1.440dccp-13),
|
||||
.c6 = 0x1.e081d6p-17,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
.scale_thresh = V4 (ScaleBound),
|
||||
.special_bound = V4 (SpecialBound),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
|
||||
}
|
||||
|
||||
/* Single-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 1.76 + 0.5 ULP.
|
||||
_ZGVnN4v_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
|
||||
want 0x1.ab2ecp-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2m1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
float32x4_t n = vrndaq_f32 (x);
|
||||
float32x4_t r = vsubq_f32 (x, n);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
|
||||
float32x4_t log2lo_c246 = vld1q_f32 (&d->log2_lo);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
|
||||
/* Pairwise horner scheme. */
|
||||
float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log2lo_c246, 3);
|
||||
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log2lo_c246, 2);
|
||||
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log2lo_c246, 1);
|
||||
float32x4_t p36 = vfmaq_f32 (p34, r2, p56);
|
||||
float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
|
||||
float32x4_t poly
|
||||
= vfmaq_laneq_f32 (vmulq_f32 (d->log2_hi, r), r, log2lo_c246, 0);
|
||||
poly = vfmaq_f32 (poly, p16, r2);
|
||||
|
||||
float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
|
||||
|
||||
if (__glibc_unlikely (v_any_u32 (cmp)))
|
||||
return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (exp2m1))
|
||||
HALF_WIDTH_ALIAS_F1 (exp2m1)
|
||||
108
sysdeps/aarch64/fpu/exp2m1f_sve.c
Normal file
108
sysdeps/aarch64/fpu/exp2m1f_sve.c
Normal file
@@ -0,0 +1,108 @@
|
||||
/* Single-precision (SVE) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32_t exponent_bias, special_offset, special_bias;
|
||||
float32_t scale_thresh, special_bound;
|
||||
float log2_lo, c2, c4, c6;
|
||||
float log2_hi, c1, c3, c5, shift;
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1f(x). */
|
||||
.log2_hi = 0x1.62e43p-1,
|
||||
.log2_lo = -0x1.05c610p-29,
|
||||
.c1 = 0x1.ebfbep-3,
|
||||
.c2 = 0x1.c6b06ep-5,
|
||||
.c3 = 0x1.3b2a5cp-7,
|
||||
.c4 = 0x1.5da59ep-10,
|
||||
.c5 = 0x1.440dccp-13,
|
||||
.c6 = 0x1.e081d6p-17,
|
||||
.exponent_bias = 0x3f800000,
|
||||
.special_offset = 0x82000000,
|
||||
.special_bias = 0x7f000000,
|
||||
.scale_thresh = ScaleBound,
|
||||
.special_bound = SpecialBound,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
|
||||
svfloat32_t scale, const struct data *d)
|
||||
{
|
||||
svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
|
||||
svfloat32_t s1 = svreinterpret_f32 (
|
||||
svsel (b, sv_u32 (d->special_offset + d->special_bias),
|
||||
sv_u32 (d->special_bias)));
|
||||
svfloat32_t s2
|
||||
= svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
|
||||
svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
|
||||
svfloat32_t r1
|
||||
= svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
|
||||
svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
|
||||
svfloat32_t r = svsel (cmp1, r1, r0);
|
||||
return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
|
||||
}
|
||||
|
||||
/* Single-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 1.76 + 0.5 ULP.
|
||||
_ZGVsMxv_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
|
||||
want 0x1.ab2ecp-2. */
|
||||
svfloat32_t SV_NAME_F1 (exp2m1) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svfloat32_t n = svrinta_x (pg, x);
|
||||
svfloat32_t r = svsub_x (pg, x, n);
|
||||
|
||||
svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
|
||||
svfloat32_t scale
|
||||
= svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
|
||||
|
||||
svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
|
||||
|
||||
svfloat32_t r2 = svmul_x (pg, r, r);
|
||||
|
||||
svfloat32_t log2lo_c246 = svld1rq (svptrue_b32 (), &d->log2_lo);
|
||||
svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, log2lo_c246, 3);
|
||||
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, log2lo_c246, 2);
|
||||
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, log2lo_c246, 1);
|
||||
|
||||
svfloat32_t p36 = svmla_x (pg, p34, p56, r2);
|
||||
svfloat32_t p16 = svmla_x (pg, p12, p36, r2);
|
||||
|
||||
svfloat32_t poly = svmla_lane (
|
||||
svmul_x (svptrue_b32 (), r, sv_f32 (d->log2_hi)), r, log2lo_c246, 0);
|
||||
poly = svmla_x (pg, poly, p16, r2);
|
||||
|
||||
svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
@@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
|
||||
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
|
||||
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
|
||||
VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
|
||||
VPCS_VECTOR_WRAPPER (exp2m1_advsimd, _ZGVnN2v_exp2m1)
|
||||
VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1)
|
||||
VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
|
||||
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
|
||||
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
|
||||
|
||||
@@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
|
||||
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
|
||||
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
|
||||
SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
|
||||
SVE_VECTOR_WRAPPER (exp2m1_sve, _ZGVsMxv_exp2m1)
|
||||
SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1)
|
||||
SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
|
||||
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
|
||||
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
|
||||
|
||||
@@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
|
||||
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
|
||||
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
|
||||
VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
|
||||
VPCS_VECTOR_WRAPPER (exp2m1f_advsimd, _ZGVnN4v_exp2m1f)
|
||||
VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f)
|
||||
VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
|
||||
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
|
||||
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
|
||||
|
||||
@@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
|
||||
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
|
||||
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
|
||||
SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
|
||||
SVE_VECTOR_WRAPPER (exp2m1f_sve, _ZGVsMxv_exp2m1f)
|
||||
SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f)
|
||||
SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
|
||||
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
|
||||
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
|
||||
|
||||
@@ -168,3 +168,13 @@ GLIBC_2.42 _ZGVsMxv_atanpi F
|
||||
GLIBC_2.42 _ZGVsMxv_atanpif F
|
||||
GLIBC_2.42 _ZGVsMxvv_atan2pi F
|
||||
GLIBC_2.42 _ZGVsMxvv_atan2pif F
|
||||
GLIBC_2.43 _ZGVnN2v_exp10m1 F
|
||||
GLIBC_2.43 _ZGVnN2v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1 F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1f F
|
||||
|
||||
Reference in New Issue
Block a user