diff options
author | Ray Essick <essick@google.com> | 2019-09-04 09:09:52 -0700 |
---|---|---|
committer | Ray Essick <essick@google.com> | 2019-09-04 18:41:08 +0000 |
commit | 5d2e0a05b7df5aca4525cc547cd8538f3339b526 (patch) | |
tree | 92b213cf1c81a89895c63d1840c68c2e14208802 | |
parent | 385b87c83fa3b6be4016babfce40c63fc359e28a (diff) | |
download | libopus-5d2e0a05b7df5aca4525cc547cd8538f3339b526.tar.gz |
improve handling at end of bufferandroid-mainline-10.0.0_r9android-mainline-10.0.0_r7android-mainline-10.0.0_r5android-mainline-10.0.0_r4android-mainline-10.0.0_r10android-10.0.0_r45android-10.0.0_r44android-10.0.0_r43android-10.0.0_r42android-10.0.0_r41android-10.0.0_r40android-10.0.0_r39android-10.0.0_r38android-10.0.0_r37android-10.0.0_r36android-10.0.0_r35android-10.0.0_r34android-10.0.0_r33android-10.0.0_r32android-10.0.0_r31android-10.0.0_r30android10-qpr3-s1-releaseandroid10-qpr3-releaseandroid10-qpr2-s4-releaseandroid10-qpr2-s3-releaseandroid10-qpr2-s2-releaseandroid10-qpr2-s1-releaseandroid10-qpr2-releaseandroid10-qpr1-mainline-releaseandroid10-mainline-media-releaseandroid10-d4-s1-releaseandroid10-d4-release
a prior change reduced iterations through the input buffer to avoid the
NEON operations from overrunning the end of the locally allocated
buffer. While avoiding the overrun, it generated bad results.
Here we instead extend the locally allocated buffers enough that the
original iteration count won't overrun.
Some pre-existing bit-exact issues remain.
Bug: 136616344
Test: CTS + bit-exact cross-checks.
(cherry picked from commit aae866aed579da4e1c3299a1e9b94a1713a0decb)
Merged-In: Ifb790a7d1d09a4ce7da900b43e3fa1f7ab01ac53
Change-Id: Ia4a94c89979d6b3b0ddead135036aec40259f53a
-rw-r--r-- | silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c | 11 |
1 files changed, 8 insertions, 3 deletions
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c index ee06f986..6f3be025 100644 --- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c +++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c @@ -84,7 +84,9 @@ void silk_warped_autocorrelation_FIX_neon( silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); - ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); + /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */ + /* Strictly, only +3 is needed but +4 simplifies initialization using the 4x32 neon load. */ + ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER + 4, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ @@ -121,6 +123,8 @@ void silk_warped_autocorrelation_FIX_neon( vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); + input_QS += 4; + vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ @@ -153,7 +157,8 @@ void silk_warped_autocorrelation_FIX_neon( opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; - ALLOC( state, length + orderT, opus_int32 ); + /* The additional +4 is to ensure a later vld1q_s32 call does not overflow. */ + ALLOC( state, length + order + 4, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. */ @@ -172,7 +177,7 @@ void silk_warped_autocorrelation_FIX_neon( state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; - } while( ++n < ( length + order - 3) ); + } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); |