From 19c5406cde4755130d484949e0bdc2bff2a0d770 Mon Sep 17 00:00:00 2001 From: Viswanath Puttagunta Date: Fri, 15 May 2015 12:42:20 -0500 Subject: armv7(float): Optimize decode usecase using NE10 library Optimize opus decode (float only) use case using ARM NE10. Mainly affects opus_ifft and clt_mdct_backward and related functions. Work based on previous Encode optimization using ARM NE10 library. See previous commit for details on how to enable this. Signed-off-by: Timothy B. Terriberry --- celt/arm/arm_celt_map.c | 22 ++++++++++ celt/arm/celt_ne10_fft.c | 26 ++++++++++++ celt/arm/celt_ne10_mdct.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++ celt/arm/fft_arm.h | 8 ++++ celt/arm/mdct_arm.h | 7 ++++ 5 files changed, 165 insertions(+) (limited to 'celt/arm') diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index 87ba3b3e..4c2d28ce 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -79,6 +79,15 @@ void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, opus_fft_float_neon /* Neon with NE10 */ }; +void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg, + const kiss_fft_cpx *fin, + kiss_fft_cpx *fout) = { + opus_ifft_c, /* ARMv4 */ + opus_ifft_c, /* EDSP */ + opus_ifft_c, /* Media */ + opus_ifft_float_neon /* Neon with NE10 */ +}; + void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, @@ -90,6 +99,19 @@ void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, clt_mdct_forward_c, /* Media */ clt_mdct_forward_float_neon /* Neon with NE10 */ }; + +void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l, + kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 *window, + int overlap, int shift, + int stride, int arch) = { + clt_mdct_backward_c, /* ARMv4 */ + clt_mdct_backward_c, /* EDSP */ + clt_mdct_backward_c, /* Media */ + clt_mdct_backward_float_neon /* Neon with NE10 */ 
+}; + # endif /* HAVE_ARM_NE10 */ # endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */ # endif /* FIXED_POINT */ diff --git a/celt/arm/celt_ne10_fft.c b/celt/arm/celt_ne10_fft.c index fc4b0da0..2ba8c559 100644 --- a/celt/arm/celt_ne10_fft.c +++ b/celt/arm/celt_ne10_fft.c @@ -118,4 +118,30 @@ void opus_fft_float_neon(const kiss_fft_state *st, } RESTORE_STACK; } + +void opus_ifft_float_neon(const kiss_fft_state *st, + const kiss_fft_cpx *fin, + kiss_fft_cpx *fout) +{ + ne10_fft_state_float32_t state; + ne10_fft_cfg_float32_t cfg = &state; + VARDECL(ne10_fft_cpx_float32_t, buffer); + SAVE_STACK; + ALLOC(buffer, st->nfft, ne10_fft_cpx_float32_t); + + if (!st->arch_fft->is_supported) { + /* This nfft length (scaled fft) not supported in NE10 */ + opus_ifft_c(st, fin, fout); + } + else { + memcpy((void *)cfg, st->arch_fft->priv, sizeof(ne10_fft_state_float32_t)); + state.buffer = (ne10_fft_cpx_float32_t *)&buffer[0]; + state.is_backward_scaled = 0; + + ne10_fft_c2c_1d_float32_neon((ne10_fft_cpx_float32_t *)fout, + (ne10_fft_cpx_float32_t *)fin, + cfg, 1); + } + RESTORE_STACK; +} #endif /* !defined(FIXED_POINT) */ diff --git a/celt/arm/celt_ne10_mdct.c b/celt/arm/celt_ne10_mdct.c index 1c6e9158..a68264e2 100644 --- a/celt/arm/celt_ne10_mdct.c +++ b/celt/arm/celt_ne10_mdct.c @@ -156,4 +156,106 @@ void clt_mdct_forward_float_neon(const mdct_lookup *l, } RESTORE_STACK; } + +void clt_mdct_backward_float_neon(const mdct_lookup *l, + kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 * OPUS_RESTRICT window, + int overlap, int shift, int stride, int arch) +{ + int i; + int N, N2, N4; + VARDECL(kiss_fft_scalar, f); + const kiss_twiddle_scalar *trig; + const kiss_fft_state *st = l->kfft[shift]; + + N = l->n; + trig = l->trig; + for (i=0;i<shift;i++) + { + N >>= 1; + trig += N; + } + N2 = N>>1; + N4 = N>>2; + + ALLOC(f, N2, kiss_fft_scalar); + + /* Pre-rotate */ + { + /* Temp pointers to make it really clear to the compiler what we're doing */ + const kiss_fft_scalar * OPUS_RESTRICT 
xp1 = in; + const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); + kiss_fft_scalar * OPUS_RESTRICT yp = f; + const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; + for(i=0;i<N4;i++) + { + kiss_fft_scalar yr, yi; + yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]); + yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]); + yp[2*i] = yr; + yp[2*i+1] = yi; + xp1+=2*stride; + xp2-=2*stride; + } + } + + opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch); + + /* Post-rotate and de-shuffle from both ends of the buffer at once to make + it in-place. */ + { + kiss_fft_scalar * yp0 = out+(overlap>>1); + kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; + const kiss_twiddle_scalar *t = &trig[0]; + /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the + middle pair will be computed twice. */ + for(i=0;i<(N4+1)>>1;i++) + { + kiss_fft_scalar re, im, yr, yi; + kiss_twiddle_scalar t0, t1; + re = yp0[0]; + im = yp0[1]; + t0 = t[i]; + t1 = t[N4+i]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL(re,t0) + S_MUL(im,t1); + yi = S_MUL(re,t1) - S_MUL(im,t0); + re = yp1[0]; + im = yp1[1]; + yp0[0] = yr; + yp1[1] = yi; + + t0 = t[(N4-i-1)]; + t1 = t[(N2-i-1)]; + /* We'd scale up by 2 here, but instead it's done when mixing the windows */ + yr = S_MUL(re,t0) + S_MUL(im,t1); + yi = S_MUL(re,t1) - S_MUL(im,t0); + yp1[0] = yr; + yp0[1] = yi; + yp0 += 2; + yp1 -= 2; + } + } + + /* Mirror on both sides for TDAC */ + { + kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; + kiss_fft_scalar * OPUS_RESTRICT yp1 = out; + const opus_val16 * OPUS_RESTRICT wp1 = window; + const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1; + + for(i = 0; i < overlap/2; i++) + { + kiss_fft_scalar x1, x2; + x1 = *xp1; + x2 = *yp1; + *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1); + *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1); + wp1++; + wp2--; + } + } + RESTORE_STACK; +} #endif /* !defined(FIXED_POINT) */ diff --git a/celt/arm/fft_arm.h b/celt/arm/fft_arm.h index e7a30d69..e57b0aa6 100644 --- a/celt/arm/fft_arm.h +++ b/celt/arm/fft_arm.h @@ -46,6 +46,11 @@ void opus_fft_free_arm_float_neon(kiss_fft_state *st); void opus_fft_float_neon(const kiss_fft_state *st, const kiss_fft_cpx *fin, 
kiss_fft_cpx *fout); + +void opus_ifft_float_neon(const kiss_fft_state *st, + const kiss_fft_cpx *fin, + kiss_fft_cpx *fout); + #if !defined(OPUS_HAVE_RTCD) #define OVERRIDE_OPUS_FFT (1) @@ -58,6 +63,9 @@ void opus_fft_float_neon(const kiss_fft_state *st, #define opus_fft(_st, _fin, _fout, arch) \ ((void)(arch), opus_fft_float_neon(_st, _fin, _fout)) +#define opus_ifft(_st, _fin, _fout, arch) \ + ((void)(arch), opus_ifft_float_neon(_st, _fin, _fout)) + #endif /* OPUS_HAVE_RTCD */ #endif /* HAVE_ARM_NE10 */ diff --git a/celt/arm/mdct_arm.h b/celt/arm/mdct_arm.h index 7d60fedc..db32efe5 100644 --- a/celt/arm/mdct_arm.h +++ b/celt/arm/mdct_arm.h @@ -43,10 +43,17 @@ void clt_mdct_forward_float_neon(const mdct_lookup *l, kiss_fft_scalar *in, const opus_val16 *window, int overlap, int shift, int stride, int arch); +void clt_mdct_backward_float_neon(const mdct_lookup *l, kiss_fft_scalar *in, + kiss_fft_scalar * OPUS_RESTRICT out, + const opus_val16 *window, int overlap, + int shift, int stride, int arch); + #if !defined(OPUS_HAVE_RTCD) #define OVERRIDE_OPUS_MDCT (1) #define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \ clt_mdct_forward_float_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch) +#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \ + clt_mdct_backward_float_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch) #endif /* OPUS_HAVE_RTCD */ #endif /* !defined(FIXED_POINT) && defined(HAVE_ARM_NE10) */ -- cgit v1.2.3