From 95d4c9f960c9469961781c923ccfdb8c3eba0562 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 13 Jul 2016 17:25:49 -0700 Subject: Optimize silk_LPC_inverse_pred_gain() for ARM NEON The optimization is bit exact with C function. Change-Id: Ib3bdc26a5a4ebe02e7f24be85104e8e9a2a9a738 Signed-off-by: Jean-Marc Valin --- silk/CNG.c | 2 +- silk/LPC_inv_pred_gain.c | 6 +- silk/NLSF2A.c | 5 +- silk/PLC.c | 2 +- silk/SigProc_FIX.h | 13 +- silk/arm/LPC_inv_pred_gain_arm.h | 57 +++++ silk/arm/LPC_inv_pred_gain_neon_intr.c | 280 ++++++++++++++++++++++ silk/arm/arm_silk_map.c | 11 + silk/decode_parameters.c | 4 +- silk/fixed/find_LPC_FIX.c | 2 +- silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h | 4 +- silk/float/find_LPC_FLP.c | 2 +- silk/float/main_FLP.h | 3 +- silk/float/wrappers_FLP.c | 5 +- silk/init_decoder.c | 1 + silk/process_NLSFs.c | 4 +- silk/structs.h | 1 + silk/tests/test_unit_LPC_inv_pred_gain.c | 3 +- 18 files changed, 384 insertions(+), 21 deletions(-) create mode 100644 silk/arm/LPC_inv_pred_gain_arm.h create mode 100644 silk/arm/LPC_inv_pred_gain_neon_intr.c (limited to 'silk') diff --git a/silk/CNG.c b/silk/CNG.c index d140db7d..e6d9b861 100644 --- a/silk/CNG.c +++ b/silk/CNG.c @@ -142,7 +142,7 @@ void silk_CNG( silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed ); /* Convert CNG NLSF to filter representation */ - silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order ); + silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch ); /* Generate CNG signal, by synthesis filtering */ silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) ); diff --git a/silk/LPC_inv_pred_gain.c b/silk/LPC_inv_pred_gain.c index b5bd5717..a3746a6e 100644 --- a/silk/LPC_inv_pred_gain.c +++ b/silk/LPC_inv_pred_gain.c @@ -39,7 +39,7 @@ POSSIBILITY OF SUCH DAMAGE. /* Compute inverse of LPC prediction gain, and */ /* test if LPC coefficients are stable (all poles within unit circle) */ -static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inverse prediction gain in energy domain, Q30 */ +static opus_int32 LPC_inverse_pred_gain_QA_c( /* O Returns inverse prediction gain in energy domain, Q30 */ opus_int32 A_QA[ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */ const opus_int order /* I Prediction order */ ) @@ -119,7 +119,7 @@ static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inver } /* For input in Q12 domain */ -opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ const opus_int order /* I Prediction order */ ) @@ -137,5 +137,5 @@ opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse predi if( DC_resp >= 4096 ) { return 0; } - return LPC_inverse_pred_gain_QA( Atmp_QA, order ); + return LPC_inverse_pred_gain_QA_c( Atmp_QA, order ); } diff --git a/silk/NLSF2A.c b/silk/NLSF2A.c index 0ea5c17e..116b465b 100644 --- a/silk/NLSF2A.c +++ b/silk/NLSF2A.c @@ -66,7 +66,8 @@ static OPUS_INLINE void silk_NLSF2A_find_poly( void silk_NLSF2A( opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ - const opus_int d /* I filter order (should be even) */ + const opus_int d, /* I filter order (should be even) */ + int arch /* I Run-time architecture */ ) { /* This ordering was found to maximize quality. It improves numerical accuracy of @@ -128,7 +129,7 @@ void silk_NLSF2A( /* Convert int32 coefficients to Q12 int16 coefs */ silk_LPC_fit( a_Q12, a32_QA1, 12, QA + 1, d ); - for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) { + for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d, arch ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) { /* Prediction coefficients are (too close to) unstable; apply bandwidth expansion */ /* on the unscaled coefficients, convert to Q12 and measure again */ silk_bwexpander_32( a32_QA1, d, 65536 - silk_LSHIFT( 2, i ) ); diff --git a/silk/PLC.c b/silk/PLC.c index 277037a9..a3e55ea7 100644 --- a/silk/PLC.c +++ b/silk/PLC.c @@ -275,7 +275,7 @@ static OPUS_INLINE void silk_PLC_conceal( /* Reduce random noise for unvoiced frames with high LPC gain */ opus_int32 invGain_Q30, down_scale_Q30; - invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order ); + invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order, arch ); down_scale_Q30 = silk_min_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_HIGH_THRES ), invGain_Q30 ); down_scale_Q30 = silk_max_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_LOW_THRES ), down_scale_Q30 ); diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h index 43eeb364..e0c39679 100644 --- a/silk/SigProc_FIX.h +++ b/silk/SigProc_FIX.h @@ -47,6 +47,10 @@ extern "C" #include "x86/SigProc_FIX_sse.h" #endif +#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)) +#include "arm/LPC_inv_pred_gain_arm.h" +#endif + /********************************************************************/ /* SIGNAL PROCESSING FUNCTIONS */ /********************************************************************/ @@ -132,7 +136,7 @@ void silk_bwexpander_32( /* Compute inverse of LPC prediction gain, and */ /* test if LPC coefficients are stable (all poles within unit circle) */ -opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */ +opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ const opus_int order /* I Prediction order */ ); @@ -146,6 +150,10 @@ void silk_ana_filt_bank_1( const opus_int32 N /* I Number of input samples */ ); +#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) +#define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_c(A_Q12, order)) +#endif + /********************************************************************/ /* SCALAR FUNCTIONS */ /********************************************************************/ @@ -265,7 +273,8 @@ void silk_A2NLSF( void silk_NLSF2A( opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */ const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */ - const opus_int d /* I filter order (should be even) */ + const opus_int d, /* I filter order (should be even) */ + int arch /* I Run-time architecture */ ); /* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */ diff --git a/silk/arm/LPC_inv_pred_gain_arm.h b/silk/arm/LPC_inv_pred_gain_arm.h new file mode 100644 index 00000000..9895b555 --- /dev/null +++ b/silk/arm/LPC_inv_pred_gain_arm.h @@ -0,0 +1,57 @@ +/*********************************************************************** +Copyright (c) 2017 Google Inc. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifndef SILK_LPC_INV_PRED_GAIN_ARM_H +# define SILK_LPC_INV_PRED_GAIN_ARM_H + +# include "celt/arm/armcpu.h" + +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +); + +# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON) +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order)) +# endif +# endif + +# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) +/*Is run-time CPU detection enabled on this platform?*/ +# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order); +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order)) +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_silk_LPC_inverse_pred_gain (1) +# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_neon(A_Q12, order)) +# endif +# endif + +#endif /* end SILK_LPC_INV_PRED_GAIN_ARM_H */ diff --git a/silk/arm/LPC_inv_pred_gain_neon_intr.c b/silk/arm/LPC_inv_pred_gain_neon_intr.c new file mode 100644 index 00000000..27142f34 --- /dev/null +++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c @@ -0,0 +1,280 @@ +/*********************************************************************** +Copyright (c) 2017 Google Inc. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +- Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +- Neither the name of Internet Society, IETF or IETF Trust, nor the +names of specific contributors, may be used to endorse or promote +products derived from this software without specific prior written +permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include "SigProc_FIX.h" +#include "define.h" + +#define QA 24 +#define A_LIMIT SILK_FIX_CONST( 0.99975, QA ) + +#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q))) + +/* The difficulty is how to judge a 64-bit signed integer tmp64 is 32-bit overflowed, + * since NEON has no 64-bit min, max or comparison instructions. + * A failed idea is to compare the results of vmovn(tmp64) and vqmovn(tmp64) whether they are equal or not. + * However, this idea fails when the tmp64 is something like 0xFFFFFFF980000000. + * Here we know that mult2Q >= 1, so the highest bit (bit 63, sign bit) of tmp64 must equal to bit 62. + * tmp64 was shifted left by 1 and we got tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1, + * then we know that bit 31 to bit 63 of tmp64 can not all be the sign bit, and therefore tmp64 is 32-bit overflowed. + * That is, we judge if tmp64' > 0x00000000FFFFFFFF, or tmp64' <= 0xFFFFFFFF00000000. + * We use narrowing shift right 31 bits to tmp32' to save data bandwidth and instructions. + * That is, we judge if tmp32' > 0x00000000, or tmp32' <= 0xFFFFFFFF. + */ + +/* Compute inverse of LPC prediction gain, and */ +/* test if LPC coefficients are stable (all poles within unit circle) */ +static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + opus_int32 A_QA[ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */ + const opus_int order /* I Prediction order */ +) +{ + opus_int k, n, mult2Q; + opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2; + opus_int32 max, min; + int32x4_t max_s32x4, min_s32x4; + int32x2_t max_s32x2, min_s32x2; + + max_s32x4 = vdupq_n_s32( silk_int32_MIN ); + min_s32x4 = vdupq_n_s32( silk_int32_MAX ); + invGain_Q30 = SILK_FIX_CONST( 1, 30 ); + for( k = order - 1; k > 0; k-- ) { + int32x2_t rc_Q31_s32x2, rc_mult2_s32x2; + int64x2_t mult2Q_s64x2; + + /* Check for stability */ + if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA ); + + /* rc_mult1_Q30 range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) ); + silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */ + silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) ); + + /* Update inverse gain */ + /* invGain_Q30 range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= ( 1 << 30 ) ); + if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) { + return 0; + } + + /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */ + mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) ); + rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 ); + + /* Update AR coefficient */ + rc_Q31_s32x2 = vdup_n_s32( rc_Q31 ); + mult2Q_s64x2 = vdupq_n_s64( -mult2Q ); + rc_mult2_s32x2 = vdup_n_s32( rc_mult2 ); + + for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) { + /* We always calculate extra elements of A_QA buffer when ( k % 4 ) != 0, to take the advantage of SIMD parallelization. */ + int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4; + int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2; + tmp1_s32x4 = vld1q_s32( A_QA + n ); + tmp2_s32x4 = vld1q_s32( A_QA + k - n - 4 ); + tmp2_s32x4 = vrev64q_s32( tmp2_s32x4 ); + tmp2_s32x4 = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) ); + t0_s32x4 = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 ); + t1_s32x4 = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 ); + t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 ); + t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 ); + t0_s64x2 = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 ); + t1_s64x2 = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 ); + t2_s64x2 = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 ); + t3_s64x2 = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 ); + t0_s64x2 = vrshlq_s64( t0_s64x2, mult2Q_s64x2 ); + t1_s64x2 = vrshlq_s64( t1_s64x2, mult2Q_s64x2 ); + t2_s64x2 = vrshlq_s64( t2_s64x2, mult2Q_s64x2 ); + t3_s64x2 = vrshlq_s64( t3_s64x2, mult2Q_s64x2 ); + t0_s32x4 = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) ); + t1_s32x4 = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) ); + s0_s32x4 = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) ); + s1_s32x4 = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) ); + max_s32x4 = vmaxq_s32( max_s32x4, s0_s32x4 ); + min_s32x4 = vminq_s32( min_s32x4, s0_s32x4 ); + max_s32x4 = vmaxq_s32( max_s32x4, s1_s32x4 ); + min_s32x4 = vminq_s32( min_s32x4, s1_s32x4 ); + t1_s32x4 = vrev64q_s32( t1_s32x4 ); + t1_s32x4 = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) ); + vst1q_s32( A_QA + n, t0_s32x4 ); + vst1q_s32( A_QA + k - n - 4, t1_s32x4 ); + } + for( ; n < (k + 1) >> 1; n++ ) { + opus_int64 tmp64; + tmp1 = A_QA[ n ]; + tmp2 = A_QA[ k - n - 1 ]; + tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp1, + MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q); + if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) { + return 0; + } + A_QA[ n ] = ( opus_int32 )tmp64; + tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp2, + MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2), mult2Q); + if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) { + return 0; + } + A_QA[ k - n - 1 ] = ( opus_int32 )tmp64; + } + } + + /* Check for stability */ + if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) { + return 0; + } + + max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) ); + min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) ); + max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) ); + min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) ); + max = vget_lane_s32( max_s32x2, 0 ); + min = vget_lane_s32( min_s32x2, 0 ); + if( ( max > 0 ) || ( min < -1 ) ) { + return 0; + } + + /* Set RC equal to negated AR coef */ + rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA ); + + /* Range: [ 1 : 2^30 ] */ + rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) ); + + /* Update inverse gain */ + /* Range: [ 0 : 2^30 ] */ + invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 ); + silk_assert( invGain_Q30 >= 0 ); + silk_assert( invGain_Q30 <= ( 1 << 30 ) ); + if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) { + return 0; + } + + return invGain_Q30; +} + +/* For input in Q12 domain */ +opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +) +{ +#ifdef OPUS_CHECK_ASM + const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order ); +#endif + + opus_int32 invGain_Q30; + if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 )) { + invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order ); + } + else { + opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ]; + opus_int32 DC_resp; + int16x8_t t0_s16x8, t1_s16x8, t2_s16x8; + int32x4_t t0_s32x4; + const opus_int leftover = order & 7; + + /* Increase Q domain of the AR coefficients */ + t0_s16x8 = vld1q_s16( A_Q12 + 0 ); + t1_s16x8 = vld1q_s16( A_Q12 + 8 ); + t2_s16x8 = vld1q_s16( A_Q12 + 16 ); + t0_s32x4 = vpaddlq_s16( t0_s16x8 ); + + switch( order - leftover ) + { + case 24: + t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 ); + /* Intend to fall through */ + + case 16: + t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 ); + vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) ); + /* Intend to fall through */ + + case 8: + { + const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) ); + const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 ); + DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 ); + vst1q_s32( Atmp_QA + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) ); + } + break; + + default: + DC_resp = 0; + break; + } + A_Q12 += order - leftover; + + switch( leftover ) + { + case 6: + DC_resp += (opus_int32)A_Q12[ 5 ]; + DC_resp += (opus_int32)A_Q12[ 4 ]; + /* Intend to fall through */ + + case 4: + DC_resp += (opus_int32)A_Q12[ 3 ]; + DC_resp += (opus_int32)A_Q12[ 2 ]; + /* Intend to fall through */ + + case 2: + DC_resp += (opus_int32)A_Q12[ 1 ]; + DC_resp += (opus_int32)A_Q12[ 0 ]; + /* Intend to fall through */ + + default: + break; + } + + /* If the DC is unstable, we don't even need to do the full calculations */ + if( DC_resp >= 4096 ) { + invGain_Q30 = 0; + } else { + vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) ); + vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) ); + invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order ); + } + } + +#ifdef OPUS_CHECK_ASM + silk_assert( invGain_Q30_c == invGain_Q30 ); +#endif + + return invGain_Q30; +} diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c index 38bdd4b9..0966cfde 100644 --- a/silk/arm/arm_silk_map.c +++ b/silk/arm/arm_silk_map.c @@ -30,12 +30,23 @@ POSSIBILITY OF SUCH DAMAGE. #include "main_FIX.h" #include "NSQ.h" +#include "SigProc_FIX.h" #if defined(OPUS_HAVE_RTCD) # if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \ !defined(OPUS_ARM_PRESUME_NEON_INTR)) +opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */ + const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */ + const opus_int order /* I Prediction order */ +) = { + silk_LPC_inverse_pred_gain_c, /* ARMv4 */ + silk_LPC_inverse_pred_gain_c, /* EDSP */ + silk_LPC_inverse_pred_gain_c, /* Media */ + MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */ +}; + void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ diff --git a/silk/decode_parameters.c b/silk/decode_parameters.c index e345b1dc..a56a4098 100644 --- a/silk/decode_parameters.c +++ b/silk/decode_parameters.c @@ -52,7 +52,7 @@ void silk_decode_parameters( silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB ); /* Convert NLSF parameters to AR prediction filter coefficients */ - silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order ); + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch ); /* If just reset, e.g., because internal Fs changed, do not allow interpolation */ /* improves the case of packet loss in the first frame after a switch */ @@ -69,7 +69,7 @@ void silk_decode_parameters( } /* Convert NLSF parameters to AR prediction filter coefficients */ - silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order ); + silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch ); } else { /* Copy LPC coefficients for first half from second half */ silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) ); diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c index e11cdc86..e55b63ac 100644 --- a/silk/fixed/find_LPC_FIX.c +++ b/silk/fixed/find_LPC_FIX.c @@ -92,7 +92,7 @@ void silk_find_LPC_FIX( silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); /* Convert to LPC for residual energy evaluation */ - silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch ); /* Calculate residual energy with NLSF interpolation */ silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch ); diff --git a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h index c30481e4..f9b5473f 100644 --- a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h +++ b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h @@ -224,8 +224,8 @@ void silk_noise_shape_analysis_FIX( silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 ); /* Ratio of prediction gains, in energy domain */ - pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder ); - nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder ); + pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder, arch ); + nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder, arch ); /*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/ pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 ); diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c index fcfe1c36..4d63964f 100644 --- a/silk/float/find_LPC_FLP.c +++ b/silk/float/find_LPC_FLP.c @@ -73,7 +73,7 @@ void silk_find_LPC_FLP( silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder ); /* Convert to LPC for residual energy evaluation */ - silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch ); /* Calculate residual energy with LSF interpolation */ silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder ); diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h index 8d1d2a8a..f47fc93b 100644 --- a/silk/float/main_FLP.h +++ b/silk/float/main_FLP.h @@ -256,7 +256,8 @@ void silk_A2NLSF_FLP( void silk_NLSF2A_FLP( silk_float *pAR, /* O LPC coefficients [ LPC_order ] */ const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */ - const opus_int LPC_order /* I LPC order */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ ); /* Limit, stabilize, and quantize NLSFs */ diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c index e8f43831..ad90b874 100644 --- a/silk/float/wrappers_FLP.c +++ b/silk/float/wrappers_FLP.c @@ -54,13 +54,14 @@ void silk_A2NLSF_FLP( void silk_NLSF2A_FLP( silk_float *pAR, /* O LPC coefficients [ LPC_order ] */ const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */ - const opus_int LPC_order /* I LPC order */ + const opus_int LPC_order, /* I LPC order */ + int arch /* I Run-time architecture */ ) { opus_int i; opus_int16 a_fix_Q12[ MAX_LPC_ORDER ]; - silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order ); + silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch ); for( i = 0; i < LPC_order; i++ ) { pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f ); diff --git a/silk/init_decoder.c b/silk/init_decoder.c index f887c678..16c03dcd 100644 --- a/silk/init_decoder.c +++ b/silk/init_decoder.c @@ -44,6 +44,7 @@ opus_int silk_init_decoder( /* Used to deactivate LSF interpolation */ psDec->first_frame_after_reset = 1; psDec->prev_gain_Q16 = 65536; + psDec->arch = opus_select_arch(); /* Reset CNG state */ silk_CNG_Reset( psDec ); diff --git a/silk/process_NLSFs.c b/silk/process_NLSFs.c index 0ab71f01..2f10f8df 100644 --- a/silk/process_NLSFs.c +++ b/silk/process_NLSFs.c @@ -89,7 +89,7 @@ void silk_process_NLSFs( NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType ); /* Convert quantized NLSFs back to LPC coefficients */ - silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch ); if( doInterpolate ) { /* Calculate the interpolated, quantized LSF vector for the first half */ @@ -97,7 +97,7 @@ void silk_process_NLSFs( psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder ); /* Convert back to LPC coefficients */ - silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder ); + silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch ); } else { /* Copy LPC coefficients for first half from second half */ diff --git a/silk/structs.h b/silk/structs.h index f7c9652f..4ff590b0 100644 --- a/silk/structs.h +++ b/silk/structs.h @@ -301,6 +301,7 @@ typedef struct { /* Stuff used for PLC */ opus_int lossCnt; opus_int prevSignalType; + int arch; silk_PLC_struct sPLC; diff --git a/silk/tests/test_unit_LPC_inv_pred_gain.c b/silk/tests/test_unit_LPC_inv_pred_gain.c index 2a22bf62..69bf6b72 100644 --- a/silk/tests/test_unit_LPC_inv_pred_gain.c +++ b/silk/tests/test_unit_LPC_inv_pred_gain.c @@ -78,6 +78,7 @@ int check_stability(opus_int16 *A_Q12, int order) { } int main(void) { + const int arch = opus_select_arch(); /* Set to 10000 so all branches in C function are triggered */ const int loop_num = 10000; int count = 0; @@ -100,7 +101,7 @@ int main(void) { for( i = 0; i < SILK_MAX_ORDER_LPC; i++ ) { A_Q12[i] = ((opus_int16)rand()) >> shift; } - gain = silk_LPC_inverse_pred_gain(A_Q12, order); + gain = silk_LPC_inverse_pred_gain(A_Q12, order, arch); /* Look for filters that silk_LPC_inverse_pred_gain() thinks are stable but definitely aren't. */ if( gain != 0 && !check_stability(A_Q12, order) ) { -- cgit v1.2.3