Math: Library: Add the hifi4 exponential function

The 32-bit HiFi4 exponential library function has an accuracy of 1e-4,
a unit in last place error of 5.60032793, and output ranges from
0.0067379470 to 148.4131591026 for inputs from -5 to +5 (Q4.28) (Q9.23).

Signed-off-by: ShriramShastry <malladi.sastry@intel.com>
This commit is contained in:
ShriramShastry 2023-05-17 19:15:10 +05:30 committed by Kai Vehmanen
parent 75e8f4b63c
commit c069197b63
7 changed files with 262 additions and 20 deletions

View File

@ -10,6 +10,18 @@
#include <stdint.h>
#if defined(__XCC__)
/* HiFi */
#include <xtensa/config/core-isa.h>
#if XCHAL_HAVE_HIFI4 == 1
#define SOFM_EXPONENTIAL_HIFI4 1
#endif
#else
/* !XCC */
#define EXPONENTIAL_GENERIC 1
#endif
int32_t sofm_exp_int32(int32_t x);
#endif

View File

@ -14,8 +14,8 @@ if(CONFIG_SQRT_FIXED)
add_local_sources(sof sqrt_int16.c)
endif()
if(CONFIG_EXP_FIXED)
add_local_sources(sof exp_fcn.c)
if(CONFIG_MATH_EXP)
add_local_sources(sof exp_fcn.c exp_fcn_hifi4.c)
endif()
if(CONFIG_MATH_DECIBELS)

View File

@ -38,13 +38,13 @@ config SQRT_FIXED
to calculate square root.square function having positive number
y as input and return the positive number x multiplied by itself (squared)
config EXP_FIXED
config MATH_EXP
bool "Exponential functions"
default n
help
By selecting this, the 32-bit sofm_exp_int32() function can be used to calculate
exponential values. With a maximum ulp of 5, an exponential function with
an input range of -5 to +5 y gives positive numbers between 0.00673794699908547 and
an input range of -5 to +5 gives positive numbers between 0.00673794699908547 and
148.413159102577. The precision of this function is 1e-4.
config NATURAL_LOGARITHM_FIXED

View File

@ -6,7 +6,6 @@
*
*/
/* Include Files */
#include <sof/math/exp_fcn.h>
#include <sof/math/numbers.h>
#include <sof/common.h>
@ -14,12 +13,14 @@
#include <stdbool.h>
#include <stdint.h>
#define BIT_MASK_Q62P2 0x4000000000000000LL
#define CONVERG_ERROR 28823037607936LL // error smaller than 1e-4,1/2 ^ -44.7122876200884
#define BIT_MASK_LOW_Q27P5 0x8000000
#define QUOTIENT_SCALE BIT(30)
#define TERMS_Q23P9 8388608
#define LSHIFT_BITS 8192
#if defined(EXPONENTIAL_GENERIC)
#define SOFM_BIT_MASK_Q62P2 0x4000000000000000LL
#define SOFM_CONVERG_ERROR 28823037607936LL // error smaller than 1e-4,1/2 ^ -44.7122876200884
#define SOFM_BIT_MASK_LOW_Q27P5 0x8000000
#define SOFM_QUOTIENT_SCALE BIT(30)
#define SOFM_TERMS_Q23P9 8388608
#define SOFM_LSHIFT_BITS 8192
/* inv multiplication lookup table */
/* LUT = ceil(1/factorial(b_n) * 2 ^ 63) */
@ -152,7 +153,7 @@ static inline int64_t lomul_s64_sr_sat_near(int64_t a, int64_t b)
uint64_t u64_rlo;
mul_s64(a, b, &u64_rhi, &u64_rlo);
const bool roundup = (u64_rlo & BIT_MASK_LOW_Q27P5) != 0;
const bool roundup = (u64_rlo & SOFM_BIT_MASK_LOW_Q27P5) != 0;
u64_rlo = (u64_rhi << 36 | u64_rlo >> 28) + (roundup ? 1 : 0);
return u64_rlo;
@ -179,8 +180,8 @@ int32_t sofm_exp_int32(int32_t x)
uint64_t ou0Lo;
int64_t qt;
int32_t b_n;
int32_t ts = TERMS_Q23P9; /* Q23.9 */
int64_t dividend = (x + LSHIFT_BITS) >> 14; /* x in Q50.14 */
int32_t ts = SOFM_TERMS_Q23P9; /* Q23.9 */
int64_t dividend = (x + SOFM_LSHIFT_BITS) >> 14; /* x in Q50.14 */
static const int32_t i_emin = -1342177280; /* Q4.28 */
static const int32_t o_emin = 56601; /* Q9.23 */
static const int32_t i_emax = 1342177280; /* Q4.28 */
@ -195,22 +196,23 @@ int32_t sofm_exp_int32(int32_t x)
return o_emax; /* 148.4131494760513306 in Q9.23 */
/* pre-computation of 1st & 2nd terms */
mul_s64(dividend, BIT_MASK_Q62P2, &ou0Hi, &ou0Lo);
mul_s64(dividend, SOFM_BIT_MASK_Q62P2, &ou0Hi, &ou0Lo);
qt = (ou0Hi << 46) | (ou0Lo >> 18);/* Q6.26 */
ts += (int32_t)((qt >> 35) + ((qt & QUOTIENT_SCALE) >> 18));
ts += (int32_t)((qt >> 35) + ((qt & SOFM_QUOTIENT_SCALE) >> 18));
dividend = lomul_s64_sr_sat_near(dividend, x);
for (b_n = 0; b_n < ARRAY_SIZE(exp_iv_ilookup); b_n++) {
mul_s64(dividend, exp_iv_ilookup[b_n], &ou0Hi, &ou0Lo);
qt = (ou0Hi << 45) | (ou0Lo >> 19);
/* sum of the remaining terms */
ts += (int32_t)((qt >> 35) + ((qt & QUOTIENT_SCALE) ? 1 : 0));
ts += (int32_t)((qt >> 35) + ((qt & SOFM_QUOTIENT_SCALE) ? 1 : 0));
dividend = lomul_s64_sr_sat_near(dividend, x);
qt = ABS(qt);
/* For inputs between -5 and 5, (qt < CONVERG_ERROR) is always true */
if (qt < CONVERG_ERROR)
/* For inputs between -5 and 5, (qt < SOFM_CONVERG_ERROR) is always true */
if (qt < SOFM_CONVERG_ERROR)
break;
}
return ts;
}
#endif

226
src/math/exp_fcn_hifi4.c Normal file
View File

@ -0,0 +1,226 @@
// SPDX-License-Identifier: BSD-3-Clause
/*
*Copyright(c) 2023 Intel Corporation. All rights reserved.
*
* Author: Shriram Shastry <malladi.sastry@linux.intel.com>
*
*/
#include <sof/common.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#if defined(SOFM_EXPONENTIAL_HIFI4)
#include <xtensa/tie/xt_hifi4.h>
#define SOFM_BIT_MASK_LOW_Q27P5 0x0000000008000000
#define SOFM_BIT_MASK_Q62P2 0x4000000000000000
#define SOFM_CONVERG_ERROR 0x1A36E2EB4000LL /* error smaller than 1e-4,1/2 ^ -44.7122876209085 */
#define SOFM_QUOTIENT_SCALE 0x400000000
#define SOFM_TERMS_Q23P9 0x800000
#define SOFM_LSHIFT_BITS BIT(13)
/*
* Arguments : int64_t in_0
* int64_t in_1
* uint64_t *ptroutbitshi
* uint64_t *ptroutbitslo
* Return Type : void
* Description:Perform element-wise multiplication on in_0 and in_1
* while keeping the required product word length and fractional
* length in mind. mul_s64 function divide the 64-bit quantities
* into two 32-bit words,multiply the low words to produce the
* lowest and second-lowest words in the result, then both pairs
* of low and high words from different numbers to produce the
* second and third lowest words in the result, and finally both
* high words to produce the two highest words in the outcome.
* Add them all up, taking carry into consideration.
*
* The 64 x 64 bit multiplication of operands in_0 and in_1 is
* shown in the image below. The 64-bit operand in_0,in_1 is
* represented by the notation in0_H, in1_H for the top 32 bits
* and in0_L, in1_L for the bottom 32 bits.
*
* in0_H : in0_L
* x in1_H : in1_L
* ---------------------
* P0 in0_L x in1_L
* P1 in0_H x in1_L 64 bit inner multiplication
* P2 in0_L x in1_H 64 bit inner multiplication
* P3 in0_H x in1_H
* --------------------
* [64 x 64 bit multiplication] sum of inner products
* All combinations are multiplied by one another and then added.
* Each inner product is moved into its proper power location.given the names
* of the inner products, redoing the addition where 000 represents 32 zero
* bits.The inner products can be added together in 64 bit addition.The sum
* of two 64-bit numbers yields a 65-bit output.
* (P0H:P0L)
* P1H(P1L:000)
* P2H(P2L:000)
* P3H:P3L(000:000)
* .......(aaa:P0L)
* By combining P0H:P0L and P1L:000. This can lead to a carry, denote as CRY0.
* The partial result is then multiplied by P2L:000.
* We call it CRY1 because it has the potential to carry again.
* (CRY0 + CRY1)P0H:P0L
* ( P1H)P1L:000
* ( P2H)P2L:000
* (P3H: P3L)000:000
* --------------------
* (ccc:bbb)aaa:P0L
* P1H, P2H, and P3H:P3L are added to the carry CRY0 + CRY1.This increase will
* not result in an overflow.
*
*/
static void mul_s64(ae_int64 in_0, ae_int64 in_1, ae_int64 *ptroutbitshi, ae_int64 *ptroutbitslo)
{
ae_int64 producthihi, producthilo, productlolo;
ae_int64 producthi, carry, product_hl_lh_h, product_hl_lh_l;
ae_int32x2 in0_32 = AE_MOVINT32X2_FROMINT64(in_0);
ae_int32x2 in1_32 = AE_MOVINT32X2_FROMINT64(in_1);
ae_ep ep_lolo = AE_MOVEA(0);
ae_ep ep_hilo = AE_MOVEA(0);
ae_ep ep_HL_LH = AE_MOVEA(0);
producthihi = AE_MUL32_HH(in0_32, in1_32);
/* AE_MULZAAD32USEP.HL.LH - Unsigned lower parts and signed higher 32-bit parts dual */
/* multiply and accumulate operation on two 32x2 operands with 72-bit output */
/* Input-32x32-bit(in1_32xin0_32)into 72-bit multiplication operations */
/* Output-lower 64 bits of the result are stored in producthilo */
/* Output-upper eight bits are stored in ep_hilo */
AE_MULZAAD32USEP_HL_LH(ep_hilo, producthilo, in1_32, in0_32);
productlolo = AE_MUL32U_LL(in0_32, in1_32);
product_hl_lh_h = AE_SRAI72(ep_hilo, producthilo, 32);
product_hl_lh_l = AE_SLAI64(producthilo, 32);
/* The AE_ADD72 procedure adds two 72-bit elements. The first 72-bit value is created */
/* by concatenating the MSBs and LSBs of operands ep[7:0] and d[63:0]. Similarly, the */
/* second value is created by concatenating bits from operands ep1[7:0] and d1[63:0]. */
AE_ADD72(ep_lolo, productlolo, ep_HL_LH, product_hl_lh_l);
carry = AE_SRAI72(ep_lolo, productlolo, 32);
carry = AE_SRLI64(carry, 32);
producthi = AE_ADD64(producthihi, carry);
producthi = AE_ADD64(producthi, product_hl_lh_h);
*ptroutbitslo = productlolo;
*ptroutbitshi = producthi;
}
/*
* Arguments : int64_t a
* int64_t b
* Return Type : int64_t
*/
static int64_t mul_s64_sr_sat_near(int64_t a, int64_t b)
{
ae_int64 result;
ae_int64 u64_chi;
ae_int64 u64_clo;
ae_int64 temp;
mul_s64(a, b, &u64_chi, &u64_clo);
ae_int64 roundup = AE_AND64(u64_clo, SOFM_SOFM_BIT_MASK_LOW_Q27P5);
roundup = AE_SRLI64(roundup, 27);
temp = AE_OR64(AE_SLAI64(u64_chi, 36), AE_SRLI64(u64_clo, 28));
return AE_ADD64(temp, roundup);
}
static ae_int64 onebyfact_Q63[19] = {
4611686018427387904LL,
1537228672809129301LL,
384307168202282325LL,
76861433640456465LL,
12810238940076077LL,
1830034134296582LL,
228754266787072LL,
25417140754119LL,
2541714075411LL,
231064915946LL,
19255409662LL,
1481185358LL,
105798954LL,
7053264LL,
440829LL,
25931LL,
1441LL,
76LL,
4LL
};
/* f(x) = a^x, x is variable and a is base
*
* Arguments : int32_t x(Q4.28)
* input range : -5 to 5
*
* Return Type : int32_t ts(Q9.23)
* output range 0.0067465305 to 148.4131488800
*+------------------+-----------------+--------+--------+
*| x | ts (returntype) | x | ts |
*+----+-----+-------+----+----+-------+--------+--------+
*|WLen| FLen|Signbit|WLen|FLen|Signbit| Qformat| Qformat|
*+----+-----+-------+----+----+-------+--------+--------+
*| 32 | 28 | 1 | 32 | 23 | 0 | 4.28 | 9.23 |
*+------------------+-----------------+--------+--------+
*/
int32_t sofm_exp_int32(int32_t x)
{
ae_int64 outhi;
ae_int64 outlo;
ae_int64 qt;
ae_int64 onebyfact;
ae_int64 temp;
ae_int64 *ponebyfact_Q63 = &onebyfact_Q63[0];
ae_int64 ts = SOFM_TERMS_Q23P9;
ae_int64 mp = (x + SOFM_LSHIFT_BITS) >> 14; /* x in Q50.14 */;
xtbool flag;
int64_t qt_temp;
int64_t b_n;
mul_s64(mp, SOFM_BIT_MASK_Q62P2, &outhi, &outlo);
qt = AE_OR64(AE_SLAI64(outhi, 46), AE_SRLI64(outlo, 18));
temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35);
ts = AE_ADD64(ts, temp);
mp = mul_s64_sr_sat_near(mp, (int64_t)x);
for (b_n = 0; b_n < 64;) {
AE_L64_IP(onebyfact, ponebyfact_Q63, 8);
mul_s64(mp, onebyfact, &outhi, &outlo);
qt = AE_OR64(AE_SLAI64(outhi, 45), AE_SRLI64(outlo, 19));
temp = AE_SRAI64(AE_ADD64(qt, SOFM_QUOTIENT_SCALE), 35);
ts = AE_ADD64(ts, temp);
mp = mul_s64_sr_sat_near(mp, (int64_t)x);
const ae_int64 sign = AE_NEG64(qt);
flag = AE_LT64(qt, 0);
AE_MOVT64(qt, sign, flag);
if (!(qt < (ae_int64)SOFM_CONVERG_ERROR))
b_n++;
else
b_n = 64;
}
return AE_MOVAD32_L(AE_MOVINT32X2_FROMINT64(ts));
}
#endif

View File

@ -13,6 +13,7 @@ cmocka_test(base2_logarithm
cmocka_test(exponential
exponential.c
${PROJECT_SOURCE_DIR}/src/math/exp_fcn.c
${PROJECT_SOURCE_DIR}/src/math/exp_fcn_hifi4.c
)
cmocka_test(square_root
square_root.c

View File

@ -700,8 +700,9 @@ zephyr_library_sources_ifdef(CONFIG_SQRT_FIXED
${SOF_MATH_PATH}/sqrt_int16.c
)
zephyr_library_sources_ifdef(CONFIG_EXP_FIXED
zephyr_library_sources_ifdef(CONFIG_MATH_EXP
${SOF_MATH_PATH}/exp_fcn.c
${SOF_MATH_PATH}/exp_fcn_hifi4.c
)
zephyr_library_sources_ifdef(CONFIG_COMP_UP_DOWN_MIXER