diff options
author | Felicia Lim <flim@google.com> | 2017-01-25 08:49:31 -0800 |
---|---|---|
committer | Felicia Lim <flim@google.com> | 2017-01-25 08:57:38 -0800 |
commit | 0a1406acbe87c63044e9da7e0ab41bcbfa704f3d (patch) | |
tree | 6dfda4da354e420a9ac2e256f168609e7d97571a /celt | |
parent | e65278181df6dea0ac1dde71f2534d66816119d2 (diff) | |
download | libopus-0a1406acbe87c63044e9da7e0ab41bcbfa704f3d.tar.gz |
Upgrade Opus to v1.1.4android-o-preview-1android-n-mr2-preview-2o-preview
Test: - verified build for arm*/mips*/x86*
- checked functionality using an emulator and stagefright
Change-Id: Iab6e7315c51e020dc986d57c89934a1205ec7a61
Diffstat (limited to 'celt')
-rw-r--r-- | celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s | 552 | ||||
-rw-r--r-- | celt/dump_modes/Makefile | 32 | ||||
-rw-r--r-- | celt/dump_modes/dump_modes.c | 353 | ||||
-rw-r--r-- | celt/dump_modes/dump_modes_arch.h | 45 | ||||
-rw-r--r-- | celt/dump_modes/dump_modes_arm_ne10.c | 152 | ||||
-rw-r--r-- | celt/fixed_c5x.h | 79 | ||||
-rw-r--r-- | celt/fixed_c6x.h | 70 |
7 files changed, 731 insertions, 552 deletions
diff --git a/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s b/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s deleted file mode 100644 index b62c5207..00000000 --- a/celt/arm/celt_pitch_xcorr_arm-gnu.S_gnu.s +++ /dev/null @@ -1,552 +0,0 @@ - .syntax unified - .syntax unified -,: Copyright (c) 2007-2008 CSIRO -,: Copyright (c) 2007-2009 Xiph.Org Foundation -,: Copyright (c) 2013 Parrot -,: Written by Aurélien Zanelli -,: -,: Redistribution and use in source and binary forms, with or without -,: modification, are permitted provided that the following conditions -,: are met: -,: -,: - Redistributions of source code must retain the above copyright -,: notice, this list of conditions and the following disclaimer. -,: -,: - Redistributions in binary form must reproduce the above copyright -,: notice, this list of conditions and the following disclaimer in the -,: documentation and/or other materials provided with the distribution. -,: -,: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -,: ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -,: LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -,: A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -,: OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -,: EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -,: PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES@ LOSS OF USE, @ DATA, OR -,: PROFITS@ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -,: LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -,: NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -,: SOFTWARE, EVEN .if ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - .text@ .p2align 2; .arch armv7-a - .fpu neon - .object_arch armv4t - - .include "celt/arm/armopts_gnu.s" - - .if OPUS_ARM_MAY_HAVE_EDSP - .global celt_pitch_xcorr_edsp - .endif - - .if OPUS_ARM_MAY_HAVE_NEON - .global celt_pitch_xcorr_neon - .endif - - .if OPUS_ARM_MAY_HAVE_NEON - -,: Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 -@ xcorr_kernel_neon: @ PROC -xcorr_kernel_neon_start:: - ,: input: - ,: r3 = int len - ,: r4 = opus_val16 *x - ,: r5 = opus_val16 *y - ,: q0 = opus_val32 sum[4] - ,: output: - ,: q0 = opus_val32 sum[4] - ,: preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 - ,: internal usage: - ,: r12 = int j - ,: d3 = y_3|y_2|y_1|y_0 - ,: q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 - ,: q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 - ,: q8 = scratch - ,: - ,: Load y[0...3] - ,: This requires len>0 to always be valid (which we assert in the C code). - VLD1.16 {d5}, [r5]! - SUBS r12, r3, #8 - BLE xcorr_kernel_neon_process4 -,: Process 8 samples at a time. -,: This loop loads one y value more than we actually need. Therefore we have to -,: stop as soon as there are 8 or fewer samples left (instead of 7), to avoid -,: reading past the end of the array. -xcorr_kernel_neon_process8:: - ,: This loop has 19 total instructions (10 cycles to issue, minimum), with - ,: - 2 cycles of ARM insrtuctions, - ,: - 10 cycles of load/store/byte permute instructions, and - ,: - 9 cycles of data processing instructions. - ,: On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the - ,: latter two categories, meaning the whole loop should run in 10 cycles per - ,: iteration, barring cache misses. - ,: - ,: Load x[0...7] - VLD1.16 {d6, d7}, [r4]! - ,: Unlike VMOV, VAND is a data processsing instruction (and doesn't get - ,: assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. - VAND d3, d5, d5 - SUBS r12, r12, #8 - ,: Load y[4...11] - VLD1.16 {d4, d5}, [r5]! - VMLAL.S16 q0, d3, d6[0] - VEXT.16 d16, d3, d4, #1 - VMLAL.S16 q0, d4, d7[0] - VEXT.16 d17, d4, d5, #1 - VMLAL.S16 q0, d16, d6[1] - VEXT.16 d16, d3, d4, #2 - VMLAL.S16 q0, d17, d7[1] - VEXT.16 d17, d4, d5, #2 - VMLAL.S16 q0, d16, d6[2] - VEXT.16 d16, d3, d4, #3 - VMLAL.S16 q0, d17, d7[2] - VEXT.16 d17, d4, d5, #3 - VMLAL.S16 q0, d16, d6[3] - VMLAL.S16 q0, d17, d7[3] - BGT xcorr_kernel_neon_process8 -,: Process 4 samples here if we have > 4 left (still reading one extra y value). -xcorr_kernel_neon_process4:: - ADDS r12, r12, #4 - BLE xcorr_kernel_neon_process2 - ,: Load x[0...3] - VLD1.16 d6, [r4]! - ,: Use VAND since it's a data processing instruction again. - VAND d4, d5, d5 - SUB r12, r12, #4 - ,: Load y[4...7] - VLD1.16 d5, [r5]! - VMLAL.S16 q0, d4, d6[0] - VEXT.16 d16, d4, d5, #1 - VMLAL.S16 q0, d16, d6[1] - VEXT.16 d16, d4, d5, #2 - VMLAL.S16 q0, d16, d6[2] - VEXT.16 d16, d4, d5, #3 - VMLAL.S16 q0, d16, d6[3] -,: Process 2 samples here if we have > 2 left (still reading one extra y value). -xcorr_kernel_neon_process2:: - ADDS r12, r12, #2 - BLE xcorr_kernel_neon_process1 - ,: Load x[0...1] - VLD2.16 {d6[],d7[]}, [r4]! - ,: Use VAND since it's a data processing instruction again. - VAND d4, d5, d5 - SUB r12, r12, #2 - ,: Load y[4...5] - VLD1.32 {d5[]}, [r5]! - VMLAL.S16 q0, d4, d6 - VEXT.16 d16, d4, d5, #1 - ,: Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI - ,: instead of VEXT, since it's a data-processing instruction. - VSRI.64 d5, d4, #32 - VMLAL.S16 q0, d16, d7 -,: Process 1 sample using the extra y value we loaded above. -xcorr_kernel_neon_process1:: - ,: Load next *x - VLD1.16 {d6[]}, [r4]! - ADDS r12, r12, #1 - ,: y[0...3] are left in d5 from prior iteration(s) (if any) - VMLAL.S16 q0, d5, d6 - MOVLE pc, lr -,: Now process 1 last sample, not reading ahead. - ,: Load last *y - VLD1.16 {d4[]}, [r5]! - VSRI.64 d4, d5, #16 - ,: Load last *x - VLD1.16 {d6[]}, [r4]! - VMLAL.S16 q0, d4, d6 - MOV pc, lr - .size xcorr_kernel_neon, .-xcorr_kernel_neon ,: @ ENDP - -,: opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, -,: opus_val32 *xcorr, int len, int max_pitch) -@ celt_pitch_xcorr_neon: @ PROC - ,: input: - ,: r0 = opus_val16 *_x - ,: r1 = opus_val16 *_y - ,: r2 = opus_val32 *xcorr - ,: r3 = int len - ,: output: - ,: r0 = int maxcorr - ,: internal usage: - ,: r4 = opus_val16 *x (for xcorr_kernel_neon()) - ,: r5 = opus_val16 *y (for xcorr_kernel_neon()) - ,: r6 = int max_pitch - ,: r12 = int j - ,: q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) - STMFD sp!, {r4-r6, lr} - LDR r6, [sp, #16] - VMOV.S32 q15, #1 - ,: if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done - SUBS r6, r6, #4 - BLT celt_pitch_xcorr_neon_process4_done -celt_pitch_xcorr_neon_process4:: - ,: xcorr_kernel_neon parameters: - ,: r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} - MOV r4, r0 - MOV r5, r1 - VEOR q0, q0, q0 - ,: xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. - ,: So we don't save/restore any other registers. - BL xcorr_kernel_neon_start - SUBS r6, r6, #4 - VST1.32 {q0}, [r2]! - ,: _y += 4 - ADD r1, r1, #8 - VMAX.S32 q15, q15, q0 - ,: if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done - BGE celt_pitch_xcorr_neon_process4 -,: We have less than 4 sums left to compute. -celt_pitch_xcorr_neon_process4_done:: - ADDS r6, r6, #4 - ,: Reduce maxcorr to a single value - VMAX.S32 d30, d30, d31 - VPMAX.S32 d30, d30, d30 - ,: if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done - BLE celt_pitch_xcorr_neon_done -,: Now compute each remaining sum one at a time. -celt_pitch_xcorr_neon_process_remaining:: - MOV r4, r0 - MOV r5, r1 - VMOV.I32 q0, #0 - SUBS r12, r3, #8 - BLT celt_pitch_xcorr_neon_process_remaining4 -,: Sum terms 8 at a time. -celt_pitch_xcorr_neon_process_remaining_loop8:: - ,: Load x[0...7] - VLD1.16 {q1}, [r4]! - ,: Load y[0...7] - VLD1.16 {q2}, [r5]! - SUBS r12, r12, #8 - VMLAL.S16 q0, d4, d2 - VMLAL.S16 q0, d5, d3 - BGE celt_pitch_xcorr_neon_process_remaining_loop8 -,: Sum terms 4 at a time. -celt_pitch_xcorr_neon_process_remaining4:: - ADDS r12, r12, #4 - BLT celt_pitch_xcorr_neon_process_remaining4_done - ,: Load x[0...3] - VLD1.16 {d2}, [r4]! - ,: Load y[0...3] - VLD1.16 {d3}, [r5]! - SUB r12, r12, #4 - VMLAL.S16 q0, d3, d2 -celt_pitch_xcorr_neon_process_remaining4_done:: - ,: Reduce the sum to a single value. - VADD.S32 d0, d0, d1 - VPADDL.S32 d0, d0 - ADDS r12, r12, #4 - BLE celt_pitch_xcorr_neon_process_remaining_loop_done -,: Sum terms 1 at a time. -celt_pitch_xcorr_neon_process_remaining_loop1:: - VLD1.16 {d2[]}, [r4]! - VLD1.16 {d3[]}, [r5]! - SUBS r12, r12, #1 - VMLAL.S16 q0, d2, d3 - BGT celt_pitch_xcorr_neon_process_remaining_loop1 -celt_pitch_xcorr_neon_process_remaining_loop_done:: - VST1.32 {d0[0]}, [r2]! - VMAX.S32 d30, d30, d0 - SUBS r6, r6, #1 - ,: _y++ - ADD r1, r1, #2 - ,: if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining - BGT celt_pitch_xcorr_neon_process_remaining -celt_pitch_xcorr_neon_done:: - VMOV.32 r0, d30[0] - LDMFD sp!, {r4-r6, pc} - .size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon ,: @ ENDP - - .endif - - .if OPUS_ARM_MAY_HAVE_EDSP - -,: This will get used on ARMv7 devices without NEON, so it has been optimized -,: to take advantage of dual-issuing where possible. -@ xcorr_kernel_edsp: @ PROC -xcorr_kernel_edsp_start:: - ,: input: - ,: r3 = int len - ,: r4 = opus_val16 *_x (must be 32-bit aligned) - ,: r5 = opus_val16 *_y (must be 32-bit aligned) - ,: r6...r9 = opus_val32 sum[4] - ,: output: - ,: r6...r9 = opus_val32 sum[4] - ,: preserved: r0-r5 - ,: internal usage - ,: r2 = int j - ,: r12,r14 = opus_val16 x[4] - ,: r10,r11 = opus_val16 y[4] - STMFD sp!, {r2,r4,r5,lr} - LDR r10, [r5], #4 ,: Load y[0...1] - SUBS r2, r3, #4 ,: j = len-4 - LDR r11, [r5], #4 ,: Load y[2...3] - BLE xcorr_kernel_edsp_process4_done - LDR r12, [r4], #4 ,: Load x[0...1] - ,: Stall -xcorr_kernel_edsp_process4:: - ,: The multiplies must issue from pipeline 0, and can't dual-issue with each - ,: other. Every other instruction here dual-issues with a multiply, and is - ,: thus "free". There should be no stalls in the body of the loop. - SMLABB r6, r12, r10, r6 ,: sum[0] = MAC16_16(sum[0],x_0,y_0) - LDR r14, [r4], #4 ,: Load x[2...3] - SMLABT r7, r12, r10, r7 ,: sum[1] = MAC16_16(sum[1],x_0,y_1) - SUBS r2, r2, #4 ,: j-=4 - SMLABB r8, r12, r11, r8 ,: sum[2] = MAC16_16(sum[2],x_0,y_2) - SMLABT r9, r12, r11, r9 ,: sum[3] = MAC16_16(sum[3],x_0,y_3) - SMLATT r6, r12, r10, r6 ,: sum[0] = MAC16_16(sum[0],x_1,y_1) - LDR r10, [r5], #4 ,: Load y[4...5] - SMLATB r7, r12, r11, r7 ,: sum[1] = MAC16_16(sum[1],x_1,y_2) - SMLATT r8, r12, r11, r8 ,: sum[2] = MAC16_16(sum[2],x_1,y_3) - SMLATB r9, r12, r10, r9 ,: sum[3] = MAC16_16(sum[3],x_1,y_4) - LDRGT r12, [r4], #4 ,: Load x[0...1] - SMLABB r6, r14, r11, r6 ,: sum[0] = MAC16_16(sum[0],x_2,y_2) - SMLABT r7, r14, r11, r7 ,: sum[1] = MAC16_16(sum[1],x_2,y_3) - SMLABB r8, r14, r10, r8 ,: sum[2] = MAC16_16(sum[2],x_2,y_4) - SMLABT r9, r14, r10, r9 ,: sum[3] = MAC16_16(sum[3],x_2,y_5) - SMLATT r6, r14, r11, r6 ,: sum[0] = MAC16_16(sum[0],x_3,y_3) - LDR r11, [r5], #4 ,: Load y[6...7] - SMLATB r7, r14, r10, r7 ,: sum[1] = MAC16_16(sum[1],x_3,y_4) - SMLATT r8, r14, r10, r8 ,: sum[2] = MAC16_16(sum[2],x_3,y_5) - SMLATB r9, r14, r11, r9 ,: sum[3] = MAC16_16(sum[3],x_3,y_6) - BGT xcorr_kernel_edsp_process4 -xcorr_kernel_edsp_process4_done:: - ADDS r2, r2, #4 - BLE xcorr_kernel_edsp_done - LDRH r12, [r4], #2 ,: r12 = *x++ - SUBS r2, r2, #1 ,: j-- - ,: Stall - SMLABB r6, r12, r10, r6 ,: sum[0] = MAC16_16(sum[0],x,y_0) - LDRHGT r14, [r4], #2 ,: r14 = *x++ - SMLABT r7, r12, r10, r7 ,: sum[1] = MAC16_16(sum[1],x,y_1) - SMLABB r8, r12, r11, r8 ,: sum[2] = MAC16_16(sum[2],x,y_2) - SMLABT r9, r12, r11, r9 ,: sum[3] = MAC16_16(sum[3],x,y_3) - BLE xcorr_kernel_edsp_done - SMLABT r6, r14, r10, r6 ,: sum[0] = MAC16_16(sum[0],x,y_1) - SUBS r2, r2, #1 ,: j-- - SMLABB r7, r14, r11, r7 ,: sum[1] = MAC16_16(sum[1],x,y_2) - LDRH r10, [r5], #2 ,: r10 = y_4 = *y++ - SMLABT r8, r14, r11, r8 ,: sum[2] = MAC16_16(sum[2],x,y_3) - LDRHGT r12, [r4], #2 ,: r12 = *x++ - SMLABB r9, r14, r10, r9 ,: sum[3] = MAC16_16(sum[3],x,y_4) - BLE xcorr_kernel_edsp_done - SMLABB r6, r12, r11, r6 ,: sum[0] = MAC16_16(sum[0],tmp,y_2) - CMP r2, #1 ,: j-- - SMLABT r7, r12, r11, r7 ,: sum[1] = MAC16_16(sum[1],tmp,y_3) - LDRH r2, [r5], #2 ,: r2 = y_5 = *y++ - SMLABB r8, r12, r10, r8 ,: sum[2] = MAC16_16(sum[2],tmp,y_4) - LDRHGT r14, [r4] ,: r14 = *x - SMLABB r9, r12, r2, r9 ,: sum[3] = MAC16_16(sum[3],tmp,y_5) - BLE xcorr_kernel_edsp_done - SMLABT r6, r14, r11, r6 ,: sum[0] = MAC16_16(sum[0],tmp,y_3) - LDRH r11, [r5] ,: r11 = y_6 = *y - SMLABB r7, r14, r10, r7 ,: sum[1] = MAC16_16(sum[1],tmp,y_4) - SMLABB r8, r14, r2, r8 ,: sum[2] = MAC16_16(sum[2],tmp,y_5) - SMLABB r9, r14, r11, r9 ,: sum[3] = MAC16_16(sum[3],tmp,y_6) -xcorr_kernel_edsp_done:: - LDMFD sp!, {r2,r4,r5,pc} - .size xcorr_kernel_edsp, .-xcorr_kernel_edsp ,: @ ENDP - -@ celt_pitch_xcorr_edsp: @ PROC - ,: input: - ,: r0 = opus_val16 *_x (must be 32-bit aligned) - ,: r1 = opus_val16 *_y (only needs to be 16-bit aligned) - ,: r2 = opus_val32 *xcorr - ,: r3 = int len - ,: output: - ,: r0 = maxcorr - ,: internal usage - ,: r4 = opus_val16 *x - ,: r5 = opus_val16 *y - ,: r6 = opus_val32 sum0 - ,: r7 = opus_val32 sum1 - ,: r8 = opus_val32 sum2 - ,: r9 = opus_val32 sum3 - ,: r1 = int max_pitch - ,: r12 = int j - STMFD sp!, {r4-r11, lr} - MOV r5, r1 - LDR r1, [sp, #36] - MOV r4, r0 - TST r5, #3 - ,: maxcorr = 1 - MOV r0, #1 - BEQ celt_pitch_xcorr_edsp_process1u_done -,: Compute one sum at the start to make y 32-bit aligned. - SUBS r12, r3, #4 - ,: r14 = sum = 0 - MOV r14, #0 - LDRH r8, [r5], #2 - BLE celt_pitch_xcorr_edsp_process1u_loop4_done - LDR r6, [r4], #4 - MOV r8, r8, LSL #16 -celt_pitch_xcorr_edsp_process1u_loop4:: - LDR r9, [r5], #4 - SMLABT r14, r6, r8, r14 ,: sum = MAC16_16(sum, x_0, y_0) - LDR r7, [r4], #4 - SMLATB r14, r6, r9, r14 ,: sum = MAC16_16(sum, x_1, y_1) - LDR r8, [r5], #4 - SMLABT r14, r7, r9, r14 ,: sum = MAC16_16(sum, x_2, y_2) - SUBS r12, r12, #4 ,: j-=4 - SMLATB r14, r7, r8, r14 ,: sum = MAC16_16(sum, x_3, y_3) - LDRGT r6, [r4], #4 - BGT celt_pitch_xcorr_edsp_process1u_loop4 - MOV r8, r8, LSR #16 -celt_pitch_xcorr_edsp_process1u_loop4_done:: - ADDS r12, r12, #4 -celt_pitch_xcorr_edsp_process1u_loop1:: - LDRHGE r6, [r4], #2 - ,: Stall - SMLABBGE r14, r6, r8, r14 ,: sum = MAC16_16(sum, *x, *y) - SUBSGE r12, r12, #1 - LDRHGT r8, [r5], #2 - BGT celt_pitch_xcorr_edsp_process1u_loop1 - ,: Restore _x - SUB r4, r4, r3, LSL #1 - ,: Restore and advance _y - SUB r5, r5, r3, LSL #1 - ,: maxcorr = max(maxcorr, sum) - CMP r0, r14 - ADD r5, r5, #2 - MOVLT r0, r14 - SUBS r1, r1, #1 - ,: xcorr[i] = sum - STR r14, [r2], #4 - BLE celt_pitch_xcorr_edsp_done -celt_pitch_xcorr_edsp_process1u_done:: - ,: if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2 - SUBS r1, r1, #4 - BLT celt_pitch_xcorr_edsp_process2 -celt_pitch_xcorr_edsp_process4:: - ,: xcorr_kernel_edsp parameters: - ,: r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} - MOV r6, #0 - MOV r7, #0 - MOV r8, #0 - MOV r9, #0 - BL xcorr_kernel_edsp_start ,: xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) - ,: maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) - CMP r0, r6 - ,: _y+=4 - ADD r5, r5, #8 - MOVLT r0, r6 - CMP r0, r7 - MOVLT r0, r7 - CMP r0, r8 - MOVLT r0, r8 - CMP r0, r9 - MOVLT r0, r9 - STMIA r2!, {r6-r9} - SUBS r1, r1, #4 - BGE celt_pitch_xcorr_edsp_process4 -celt_pitch_xcorr_edsp_process2:: - ADDS r1, r1, #2 - BLT celt_pitch_xcorr_edsp_process1a - SUBS r12, r3, #4 - ,: {r10, r11} = {sum0, sum1} = {0, 0} - MOV r10, #0 - MOV r11, #0 - LDR r8, [r5], #4 - BLE celt_pitch_xcorr_edsp_process2_loop_done - LDR r6, [r4], #4 - LDR r9, [r5], #4 -celt_pitch_xcorr_edsp_process2_loop4:: - SMLABB r10, r6, r8, r10 ,: sum0 = MAC16_16(sum0, x_0, y_0) - LDR r7, [r4], #4 - SMLABT r11, r6, r8, r11 ,: sum1 = MAC16_16(sum1, x_0, y_1) - SUBS r12, r12, #4 ,: j-=4 - SMLATT r10, r6, r8, r10 ,: sum0 = MAC16_16(sum0, x_1, y_1) - LDR r8, [r5], #4 - SMLATB r11, r6, r9, r11 ,: sum1 = MAC16_16(sum1, x_1, y_2) - LDRGT r6, [r4], #4 - SMLABB r10, r7, r9, r10 ,: sum0 = MAC16_16(sum0, x_2, y_2) - SMLABT r11, r7, r9, r11 ,: sum1 = MAC16_16(sum1, x_2, y_3) - SMLATT r10, r7, r9, r10 ,: sum0 = MAC16_16(sum0, x_3, y_3) - LDRGT r9, [r5], #4 - SMLATB r11, r7, r8, r11 ,: sum1 = MAC16_16(sum1, x_3, y_4) - BGT celt_pitch_xcorr_edsp_process2_loop4 -celt_pitch_xcorr_edsp_process2_loop_done:: - ADDS r12, r12, #2 - BLE celt_pitch_xcorr_edsp_process2_1 - LDR r6, [r4], #4 - ,: Stall - SMLABB r10, r6, r8, r10 ,: sum0 = MAC16_16(sum0, x_0, y_0) - LDR r9, [r5], #4 - SMLABT r11, r6, r8, r11 ,: sum1 = MAC16_16(sum1, x_0, y_1) - SUB r12, r12, #2 - SMLATT r10, r6, r8, r10 ,: sum0 = MAC16_16(sum0, x_1, y_1) - MOV r8, r9 - SMLATB r11, r6, r9, r11 ,: sum1 = MAC16_16(sum1, x_1, y_2) -celt_pitch_xcorr_edsp_process2_1:: - LDRH r6, [r4], #2 - ADDS r12, r12, #1 - ,: Stall - SMLABB r10, r6, r8, r10 ,: sum0 = MAC16_16(sum0, x_0, y_0) - LDRHGT r7, [r4], #2 - SMLABT r11, r6, r8, r11 ,: sum1 = MAC16_16(sum1, x_0, y_1) - BLE celt_pitch_xcorr_edsp_process2_done - LDRH r9, [r5], #2 - SMLABT r10, r7, r8, r10 ,: sum0 = MAC16_16(sum0, x_0, y_1) - SMLABB r11, r7, r9, r11 ,: sum1 = MAC16_16(sum1, x_0, y_2) -celt_pitch_xcorr_edsp_process2_done:: - ,: Restore _x - SUB r4, r4, r3, LSL #1 - ,: Restore and advance _y - SUB r5, r5, r3, LSL #1 - ,: maxcorr = max(maxcorr, sum0) - CMP r0, r10 - ADD r5, r5, #2 - MOVLT r0, r10 - SUB r1, r1, #2 - ,: maxcorr = max(maxcorr, sum1) - CMP r0, r11 - ,: xcorr[i] = sum - STR r10, [r2], #4 - MOVLT r0, r11 - STR r11, [r2], #4 -celt_pitch_xcorr_edsp_process1a:: - ADDS r1, r1, #1 - BLT celt_pitch_xcorr_edsp_done - SUBS r12, r3, #4 - ,: r14 = sum = 0 - MOV r14, #0 - BLT celt_pitch_xcorr_edsp_process1a_loop_done - LDR r6, [r4], #4 - LDR r8, [r5], #4 - LDR r7, [r4], #4 - LDR r9, [r5], #4 -celt_pitch_xcorr_edsp_process1a_loop4:: - SMLABB r14, r6, r8, r14 ,: sum = MAC16_16(sum, x_0, y_0) - SUBS r12, r12, #4 ,: j-=4 - SMLATT r14, r6, r8, r14 ,: sum = MAC16_16(sum, x_1, y_1) - LDRGE r6, [r4], #4 - SMLABB r14, r7, r9, r14 ,: sum = MAC16_16(sum, x_2, y_2) - LDRGE r8, [r5], #4 - SMLATT r14, r7, r9, r14 ,: sum = MAC16_16(sum, x_3, y_3) - LDRGE r7, [r4], #4 - LDRGE r9, [r5], #4 - BGE celt_pitch_xcorr_edsp_process1a_loop4 -celt_pitch_xcorr_edsp_process1a_loop_done:: - ADDS r12, r12, #2 - LDRGE r6, [r4], #4 - LDRGE r8, [r5], #4 - ,: Stall - SMLABBGE r14, r6, r8, r14 ,: sum = MAC16_16(sum, x_0, y_0) - SUBGE r12, r12, #2 - SMLATTGE r14, r6, r8, r14 ,: sum = MAC16_16(sum, x_1, y_1) - ADDS r12, r12, #1 - LDRHGE r6, [r4], #2 - LDRHGE r8, [r5], #2 - ,: Stall - SMLABBGE r14, r6, r8, r14 ,: sum = MAC16_16(sum, *x, *y) - ,: maxcorr = max(maxcorr, sum) - CMP r0, r14 - ,: xcorr[i] = sum - STR r14, [r2], #4 - MOVLT r0, r14 -celt_pitch_xcorr_edsp_done:: - LDMFD sp!, {r4-r11, pc} - .size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp ,: @ ENDP - - .endif - -,: @ END: - .section .note.GNU-stack,"",%progbits diff --git a/celt/dump_modes/Makefile b/celt/dump_modes/Makefile new file mode 100644 index 00000000..93f599fb --- /dev/null +++ b/celt/dump_modes/Makefile @@ -0,0 +1,32 @@ + +CFLAGS=-O2 -Wall -Wextra -DHAVE_CONFIG_H +INCLUDES=-I. -I../ -I../.. -I../../include + +SOURCES = dump_modes.c \ + ../modes.c \ + ../cwrs.c \ + ../rate.c \ + ../entcode.c \ + ../entenc.c \ + ../entdec.c \ + ../mathops.c \ + ../mdct.c \ + ../kiss_fft.c + +ifdef HAVE_ARM_NE10 +CC = gcc +CFLAGS += -mfpu=neon +INCLUDES += -I$(NE10_INCDIR) -DHAVE_ARM_NE10 -DOPUS_ARM_PRESUME_NEON_INTR +LIBS = -L$(NE10_LIBDIR) -lNE10 +SOURCES += ../arm/celt_ne10_fft.c \ + dump_modes_arm_ne10.c \ + ../arm/armcpu.c +endif + +all: dump_modes + +dump_modes: + $(PREFIX)$(CC) $(CFLAGS) $(INCLUDES) -DCUSTOM_MODES_ONLY -DCUSTOM_MODES $(SOURCES) -o $@ $(LIBS) -lm + +clean: + rm -f dump_modes diff --git a/celt/dump_modes/dump_modes.c b/celt/dump_modes/dump_modes.c new file mode 100644 index 00000000..9105a534 --- /dev/null +++ b/celt/dump_modes/dump_modes.c @@ -0,0 +1,353 @@ +/* Copyright (c) 2008 CSIRO + Copyright (c) 2008-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> +#include <stdio.h> +#include "modes.h" +#include "celt.h" +#include "rate.h" +#include "dump_modes_arch.h" + +#define INT16 "%d" +#define INT32 "%d" +#define FLOAT "%#0.8gf" + +#ifdef FIXED_POINT +#define WORD16 INT16 +#define WORD32 INT32 +#else +#define WORD16 FLOAT +#define WORD32 FLOAT +#endif + +void dump_modes(FILE *file, CELTMode **modes, int nb_modes) +{ + int i, j, k; + int mdct_twiddles_size; + fprintf(file, "/* The contents of this file was automatically generated by dump_modes.c\n"); + fprintf(file, " with arguments:"); + for (i=0;i<nb_modes;i++) + { + CELTMode *mode = modes[i]; + fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts); + } + fprintf(file, "\n It contains static definitions for some pre-defined modes. */\n"); + fprintf(file, "#include \"modes.h\"\n"); + fprintf(file, "#include \"rate.h\"\n"); + fprintf(file, "\n#ifdef HAVE_ARM_NE10\n"); + fprintf(file, "#define OVERRIDE_FFT 1\n"); + fprintf(file, "#include \"%s\"\n", ARM_NE10_ARCH_FILE_NAME); + fprintf(file, "#endif\n"); + + fprintf(file, "\n"); + + for (i=0;i<nb_modes;i++) + { + CELTMode *mode = modes[i]; + int mdctSize; + int standard, framerate; + + mdctSize = mode->shortMdctSize*mode->nbShortMdcts; + standard = (mode->Fs == 400*(opus_int32)mode->shortMdctSize); + framerate = mode->Fs/mode->shortMdctSize; + + if (!standard) + { + fprintf(file, "#ifndef DEF_EBANDS%d_%d\n", mode->Fs, mdctSize); + fprintf(file, "#define DEF_EBANDS%d_%d\n", mode->Fs, mdctSize); + fprintf (file, "static const opus_int16 eBands%d_%d[%d] = {\n", mode->Fs, mdctSize, mode->nbEBands+2); + for (j=0;j<mode->nbEBands+2;j++) + fprintf (file, "%d, ", mode->eBands[j]); + fprintf (file, "};\n"); + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + } + + fprintf(file, "#ifndef DEF_WINDOW%d\n", mode->overlap); + fprintf(file, "#define DEF_WINDOW%d\n", mode->overlap); + fprintf (file, "static const opus_val16 window%d[%d] = {\n", mode->overlap, mode->overlap); + for (j=0;j<mode->overlap;j++) + fprintf (file, WORD16 ",%c", mode->window[j],(j+6)%5==0?'\n':' '); + fprintf (file, "};\n"); + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + + if (!standard) + { + fprintf(file, "#ifndef DEF_ALLOC_VECTORS%d_%d\n", mode->Fs, mdctSize); + fprintf(file, "#define DEF_ALLOC_VECTORS%d_%d\n", mode->Fs, mdctSize); + fprintf (file, "static const unsigned char allocVectors%d_%d[%d] = {\n", mode->Fs, mdctSize, mode->nbEBands*mode->nbAllocVectors); + for (j=0;j<mode->nbAllocVectors;j++) + { + for (k=0;k<mode->nbEBands;k++) + fprintf (file, "%2d, ", mode->allocVectors[j*mode->nbEBands+k]); + fprintf (file, "\n"); + } + fprintf (file, "};\n"); + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + } + + fprintf(file, "#ifndef DEF_LOGN%d\n", framerate); + fprintf(file, "#define DEF_LOGN%d\n", framerate); + fprintf (file, "static const opus_int16 logN%d[%d] = {\n", framerate, mode->nbEBands); + for (j=0;j<mode->nbEBands;j++) + fprintf (file, "%d, ", mode->logN[j]); + fprintf (file, "};\n"); + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + + /* Pulse cache */ + fprintf(file, "#ifndef DEF_PULSE_CACHE%d\n", mode->Fs/mdctSize); + fprintf(file, "#define DEF_PULSE_CACHE%d\n", mode->Fs/mdctSize); + fprintf (file, "static const opus_int16 cache_index%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+2)*mode->nbEBands); + for (j=0;j<mode->nbEBands*(mode->maxLM+2);j++) + fprintf (file, "%d,%c", mode->cache.index[j],(j+16)%15==0?'\n':' '); + fprintf (file, "};\n"); + fprintf (file, "static const unsigned char cache_bits%d[%d] = {\n", mode->Fs/mdctSize, mode->cache.size); + for (j=0;j<mode->cache.size;j++) + fprintf (file, "%d,%c", mode->cache.bits[j],(j+16)%15==0?'\n':' '); + fprintf (file, "};\n"); + fprintf (file, "static const unsigned char cache_caps%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+1)*2*mode->nbEBands); + for (j=0;j<(mode->maxLM+1)*2*mode->nbEBands;j++) + fprintf (file, "%d,%c", mode->cache.caps[j],(j+16)%15==0?'\n':' '); + fprintf (file, "};\n"); + + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + + /* FFT twiddles */ + fprintf(file, "#ifndef FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize); + fprintf(file, "#define FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize); + fprintf (file, "static const kiss_twiddle_cpx fft_twiddles%d_%d[%d] = {\n", + mode->Fs, mdctSize, mode->mdct.kfft[0]->nfft); + for (j=0;j<mode->mdct.kfft[0]->nfft;j++) + fprintf (file, "{" WORD16 ", " WORD16 "},%c", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i,(j+3)%2==0?'\n':' '); + fprintf (file, "};\n"); + +#ifdef OVERRIDE_FFT + dump_mode_arch(mode); +#endif + /* FFT Bitrev tables */ + for (k=0;k<=mode->mdct.maxshift;k++) + { + fprintf(file, "#ifndef FFT_BITREV%d\n", mode->mdct.kfft[k]->nfft); + fprintf(file, "#define FFT_BITREV%d\n", mode->mdct.kfft[k]->nfft); + fprintf (file, "static const opus_int16 fft_bitrev%d[%d] = {\n", + mode->mdct.kfft[k]->nfft, mode->mdct.kfft[k]->nfft); + for (j=0;j<mode->mdct.kfft[k]->nfft;j++) + fprintf (file, "%d,%c", mode->mdct.kfft[k]->bitrev[j],(j+16)%15==0?'\n':' '); + fprintf (file, "};\n"); + + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + } + + /* FFT States */ + for (k=0;k<=mode->mdct.maxshift;k++) + { + fprintf(file, "#ifndef FFT_STATE%d_%d_%d\n", mode->Fs, mdctSize, k); + fprintf(file, "#define FFT_STATE%d_%d_%d\n", mode->Fs, mdctSize, k); + fprintf (file, "static const kiss_fft_state fft_state%d_%d_%d = {\n", + mode->Fs, mdctSize, k); + fprintf (file, "%d, /* nfft */\n", mode->mdct.kfft[k]->nfft); + fprintf (file, WORD16 ", /* scale */\n", mode->mdct.kfft[k]->scale); +#ifdef FIXED_POINT + fprintf (file, "%d, /* scale_shift */\n", mode->mdct.kfft[k]->scale_shift); +#endif + fprintf (file, "%d, /* shift */\n", mode->mdct.kfft[k]->shift); + fprintf (file, "{"); + for (j=0;j<2*MAXFACTORS;j++) + fprintf (file, "%d, ", mode->mdct.kfft[k]->factors[j]); + fprintf (file, "}, /* factors */\n"); + fprintf (file, "fft_bitrev%d, /* bitrev */\n", mode->mdct.kfft[k]->nfft); + fprintf (file, "fft_twiddles%d_%d, /* bitrev */\n", mode->Fs, mdctSize); + + fprintf (file, "#ifdef OVERRIDE_FFT\n"); + fprintf (file, "(arch_fft_state *)&cfg_arch_%d,\n", mode->mdct.kfft[k]->nfft); + fprintf (file, "#else\n"); + fprintf (file, "NULL,\n"); + fprintf(file, "#endif\n"); + + fprintf (file, "};\n"); + + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + } + + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + + /* MDCT twiddles */ + mdct_twiddles_size = mode->mdct.n-(mode->mdct.n/2>>mode->mdct.maxshift); + fprintf(file, "#ifndef MDCT_TWIDDLES%d\n", mdctSize); + fprintf(file, "#define MDCT_TWIDDLES%d\n", mdctSize); + fprintf (file, "static const opus_val16 mdct_twiddles%d[%d] = {\n", + mdctSize, mdct_twiddles_size); + for (j=0;j<mdct_twiddles_size;j++) + fprintf (file, WORD16 ",%c", mode->mdct.trig[j],(j+6)%5==0?'\n':' '); + fprintf (file, "};\n"); + + fprintf(file, "#endif\n"); + fprintf(file, "\n"); + + + /* Print the actual mode data */ + fprintf(file, "static const CELTMode mode%d_%d_%d = {\n", mode->Fs, mdctSize, mode->overlap); + fprintf(file, INT32 ", /* Fs */\n", mode->Fs); + fprintf(file, "%d, /* overlap */\n", mode->overlap); + fprintf(file, "%d, /* nbEBands */\n", mode->nbEBands); + fprintf(file, "%d, /* effEBands */\n", mode->effEBands); + fprintf(file, "{"); + for (j=0;j<4;j++) + fprintf(file, WORD16 ", ", mode->preemph[j]); + fprintf(file, "}, /* preemph */\n"); + if (standard) + fprintf(file, "eband5ms, /* eBands */\n"); + else + fprintf(file, "eBands%d_%d, /* eBands */\n", mode->Fs, mdctSize); + + fprintf(file, "%d, /* maxLM */\n", mode->maxLM); + fprintf(file, "%d, /* nbShortMdcts */\n", mode->nbShortMdcts); + fprintf(file, "%d, /* shortMdctSize */\n", mode->shortMdctSize); + + fprintf(file, "%d, /* nbAllocVectors */\n", mode->nbAllocVectors); + if (standard) + fprintf(file, "band_allocation, /* allocVectors */\n"); + else + fprintf(file, "allocVectors%d_%d, /* allocVectors */\n", mode->Fs, mdctSize); + + fprintf(file, "logN%d, /* logN */\n", framerate); + fprintf(file, "window%d, /* window */\n", mode->overlap); + fprintf(file, "{%d, %d, {", mode->mdct.n, mode->mdct.maxshift); + for (k=0;k<=mode->mdct.maxshift;k++) + fprintf(file, "&fft_state%d_%d_%d, ", mode->Fs, mdctSize, k); + fprintf (file, "}, mdct_twiddles%d}, /* mdct */\n", mdctSize); + + fprintf(file, "{%d, cache_index%d, cache_bits%d, cache_caps%d}, /* cache */\n", + mode->cache.size, mode->Fs/mdctSize, mode->Fs/mdctSize, mode->Fs/mdctSize); + fprintf(file, "};\n"); + } + fprintf(file, "\n"); + fprintf(file, "/* List of all the available modes */\n"); + fprintf(file, "#define TOTAL_MODES %d\n", nb_modes); + fprintf(file, "static const CELTMode * const static_mode_list[TOTAL_MODES] = {\n"); + for (i=0;i<nb_modes;i++) + { + CELTMode *mode = modes[i]; + int mdctSize; + mdctSize = mode->shortMdctSize*mode->nbShortMdcts; + fprintf(file, "&mode%d_%d_%d,\n", mode->Fs, mdctSize, mode->overlap); + } + fprintf(file, "};\n"); +} + +void dump_header(FILE *file, CELTMode **modes, int nb_modes) +{ + int i; + int channels = 0; + int frame_size = 0; + int overlap = 0; + fprintf (file, "/* This header file is generated automatically*/\n"); + for (i=0;i<nb_modes;i++) + { + CELTMode *mode = modes[i]; + if (frame_size==0) + frame_size = mode->shortMdctSize*mode->nbShortMdcts; + else if (frame_size != mode->shortMdctSize*mode->nbShortMdcts) + frame_size = -1; + if (overlap==0) + overlap = mode->overlap; + else if (overlap != mode->overlap) + overlap = -1; + } + if (channels>0) + { + fprintf (file, "#define CHANNELS(mode) %d\n", channels); + if (channels==1) + fprintf (file, "#define DISABLE_STEREO\n"); + } + if (frame_size>0) + { + fprintf (file, "#define FRAMESIZE(mode) %d\n", frame_size); + } + if (overlap>0) + { + fprintf (file, "#define OVERLAP(mode) %d\n", overlap); + } +} + +#ifdef FIXED_POINT +#define BASENAME "static_modes_fixed" +#else +#define BASENAME "static_modes_float" +#endif + +int main(int argc, char **argv) +{ + int i, nb; + FILE *file; + CELTMode **m; + if (argc%2 != 1 || argc<3) + { + fprintf (stderr, "Usage: %s rate frame_size [rate frame_size] [rate frame_size]...\n",argv[0]); + return 1; + } + nb = (argc-1)/2; + m = malloc(nb*sizeof(CELTMode*)); + for (i=0;i<nb;i++) + { + int Fs, frame; + Fs = atoi(argv[2*i+1]); + frame = atoi(argv[2*i+2]); + m[i] = opus_custom_mode_create(Fs, frame, NULL); + if (m[i]==NULL) + { + fprintf(stderr,"Error creating mode with Fs=%s, frame_size=%s\n", + argv[2*i+1],argv[2*i+2]); + return EXIT_FAILURE; + } + } + file = fopen(BASENAME ".h", "w"); +#ifdef OVERRIDE_FFT + dump_modes_arch_init(m, nb); +#endif + dump_modes(file, m, nb); + fclose(file); +#ifdef OVERRIDE_FFT + dump_modes_arch_finalize(); +#endif + for (i=0;i<nb;i++) + opus_custom_mode_destroy(m[i]); + free(m); + return 0; +} diff --git a/celt/dump_modes/dump_modes_arch.h b/celt/dump_modes/dump_modes_arch.h new file mode 100644 index 00000000..cc0d4be1 --- /dev/null +++ b/celt/dump_modes/dump_modes_arch.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2015 Xiph.Org Foundation + Written by Viswanath Puttagunta */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef DUMP_MODE_ARCH_H +#define DUMP_MODE_ARCH_H + +void dump_modes_arch_init(); +void dump_mode_arch(CELTMode *mode); +void dump_modes_arch_finalize(); + +#if !defined(FIXED_POINT) +#define ARM_NE10_ARCH_FILE_NAME "static_modes_float_arm_ne10.h" +#else +#define ARM_NE10_ARCH_FILE_NAME "static_modes_fixed_arm_ne10.h" +#endif + +#if defined(HAVE_ARM_NE10) +#define OVERRIDE_FFT (1) +#endif + +#endif diff --git a/celt/dump_modes/dump_modes_arm_ne10.c b/celt/dump_modes/dump_modes_arm_ne10.c new file mode 100644 index 00000000..47578cda --- /dev/null +++ b/celt/dump_modes/dump_modes_arm_ne10.c @@ -0,0 +1,152 @@ +/* Copyright (c) 2015 Xiph.Org Foundation + Written by Viswanath Puttagunta */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if defined(HAVE_CONFIG_H) +# include "config.h" +#endif + +#include <stdio.h> +#include <stdlib.h> +#include "modes.h" +#include "dump_modes_arch.h" +#include <NE10_dsp.h> + +#if !defined(FIXED_POINT) +# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t +# define NE10_FFT_CPX_TYPE_T_STR "ne10_fft_cpx_float32_t" +# define NE10_FFT_STATE_TYPE_T_STR "ne10_fft_state_float32_t" +#else +# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t +# define NE10_FFT_CPX_TYPE_T_STR "ne10_fft_cpx_int32_t" +# define NE10_FFT_STATE_TYPE_T_STR "ne10_fft_state_int32_t" +#endif + +static FILE *file; + +void dump_modes_arch_init(CELTMode **modes, int nb_modes) +{ + int i; + + file = fopen(ARM_NE10_ARCH_FILE_NAME, "w"); + fprintf(file, "/* The contents of this file was automatically generated by\n"); + fprintf(file, " * dump_mode_arm_ne10.c with arguments:"); + for (i=0;i<nb_modes;i++) + { + CELTMode *mode = modes[i]; + fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts); + } + fprintf(file, "\n * It contains static definitions for some pre-defined modes. */\n"); + fprintf(file, "#include <NE10_init.h>\n\n"); +} + +void dump_modes_arch_finalize() +{ + fclose(file); +} + +void dump_mode_arch(CELTMode *mode) +{ + int k, j; + int mdctSize; + + mdctSize = mode->shortMdctSize*mode->nbShortMdcts; + + fprintf(file, "#ifndef NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize); + fprintf(file, "#define NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize); + /* cfg->factors */ + for(k=0;k<=mode->mdct.maxshift;k++) { + NE10_FFT_CFG_TYPE_T cfg; + cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv; + if (!cfg) + continue; + fprintf(file, "static const ne10_int32_t ne10_factors_%d[%d] = {\n", + mode->mdct.kfft[k]->nfft, (NE10_MAXFACTORS * 2)); + for(j=0;j<(NE10_MAXFACTORS * 2);j++) { + fprintf(file, "%d,%c", cfg->factors[j],(j+16)%15==0?'\n':' '); + } + fprintf (file, "};\n"); + } + + /* cfg->twiddles */ + for(k=0;k<=mode->mdct.maxshift;k++) { + NE10_FFT_CFG_TYPE_T cfg; + cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv; + if (!cfg) + continue; + fprintf(file, "static const %s ne10_twiddles_%d[%d] = {\n", + NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft, + mode->mdct.kfft[k]->nfft); + for(j=0;j<mode->mdct.kfft[k]->nfft;j++) { +#if !defined(FIXED_POINT) + fprintf(file, "{%#0.8gf,%#0.8gf},%c", + cfg->twiddles[j].r, cfg->twiddles[j].i,(j+4)%3==0?'\n':' '); +#else + fprintf(file, "{%d,%d},%c", + cfg->twiddles[j].r, cfg->twiddles[j].i,(j+4)%3==0?'\n':' '); +#endif + } + fprintf (file, "};\n"); + } + + for(k=0;k<=mode->mdct.maxshift;k++) { + NE10_FFT_CFG_TYPE_T cfg; + cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv; + if (!cfg) { + fprintf(file, "/* Ne10 does not support scaled FFT for length = %d */\n", + mode->mdct.kfft[k]->nfft); + fprintf(file, "static const arch_fft_state cfg_arch_%d = {\n", mode->mdct.kfft[k]->nfft); + fprintf(file, "0,\n"); + fprintf(file, "NULL\n"); + fprintf(file, "};\n"); + continue; + } + fprintf(file, "static const %s %s_%d = {\n", NE10_FFT_STATE_TYPE_T_STR, + NE10_FFT_STATE_TYPE_T_STR, mode->mdct.kfft[k]->nfft); + fprintf(file, "%d,\n", cfg->nfft); + fprintf(file, "(ne10_int32_t *)ne10_factors_%d,\n", mode->mdct.kfft[k]->nfft); + fprintf(file, "(%s *)ne10_twiddles_%d,\n", + NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft); + fprintf(file, "NULL,\n"); /* buffer */ + fprintf(file, "(%s *)&ne10_twiddles_%d[%d],\n", + NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft, cfg->nfft); +#if !defined(FIXED_POINT) + fprintf(file, "/* is_forward_scaled = true */\n"); + fprintf(file, "(ne10_int32_t) 1,\n"); + fprintf(file, "/* is_backward_scaled = false */\n"); + fprintf(file, "(ne10_int32_t) 0,\n"); +#endif + fprintf(file, "};\n"); + + fprintf(file, "static const arch_fft_state cfg_arch_%d = {\n", + mode->mdct.kfft[k]->nfft); + fprintf(file, "1,\n"); + fprintf(file, "(void *)&%s_%d,\n", + NE10_FFT_STATE_TYPE_T_STR, mode->mdct.kfft[k]->nfft); + fprintf(file, "};\n\n"); + } + fprintf(file, "#endif /* end NE10_FFT_PARAMS%d_%d */\n", mode->Fs, mdctSize); +} diff --git a/celt/fixed_c5x.h b/celt/fixed_c5x.h new file mode 100644 index 00000000..ea95a998 --- /dev/null +++ b/celt/fixed_c5x.h @@ -0,0 +1,79 @@ +/* Copyright (C) 2003 Jean-Marc Valin */ +/** + @file fixed_c5x.h + @brief Fixed-point operations for the TI C5x DSP family +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef FIXED_C5X_H +#define FIXED_C5X_H + +#include "dsplib.h" + +#undef IMUL32 +static OPUS_INLINE long IMUL32(long i, long j) +{ + long ac0, ac1; + ac0 = _lmpy(i>>16,j); + ac1 = ac0 + _lmpy(i,j>>16); + return _lmpyu(i,j) + (ac1<<16); +} + +#undef MAX16 +#define MAX16(a,b) _max(a,b) + +#undef MIN16 +#define MIN16(a,b) _min(a,b) + +#undef MAX32 +#define MAX32(a,b) _lmax(a,b) + +#undef MIN32 +#define MIN32(a,b) _lmin(a,b) + +#undef VSHR32 +#define VSHR32(a, shift) _lshl(a,-(shift)) + +#undef MULT16_16_Q15 +#define MULT16_16_Q15(a,b) (_smpy(a,b)) + +#undef MULT16_16SU +#define MULT16_16SU(a,b) _lmpysu(a,b) + +#undef MULT_16_16 +#define MULT_16_16(a,b) _lmpy(a,b) + +/* FIXME: This is technically incorrect and is bound to cause problems. Is there any cleaner solution? */ +#undef MULT16_32_Q15 +#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),(b)),15)) + +#define celt_ilog2(x) (30 - _lnorm(x)) +#define OVERRIDE_CELT_ILOG2 + +#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len))) +#define OVERRIDE_CELT_MAXABS16 + +#endif /* FIXED_C5X_H */ diff --git a/celt/fixed_c6x.h b/celt/fixed_c6x.h new file mode 100644 index 00000000..bb6ad927 --- /dev/null +++ b/celt/fixed_c6x.h @@ -0,0 +1,70 @@ +/* Copyright (C) 2008 CSIRO */ +/** + @file fixed_c6x.h + @brief Fixed-point operations for the TI C6x DSP family +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef FIXED_C6X_H +#define FIXED_C6X_H + +#undef MULT16_16SU +#define MULT16_16SU(a,b) _mpysu(a,b) + +#undef MULT_16_16 +#define MULT_16_16(a,b) _mpy(a,b) + +#define celt_ilog2(x) (30 - _norm(x)) +#define OVERRIDE_CELT_ILOG2 + +#undef MULT16_32_Q15 +#define MULT16_32_Q15(a,b) (_mpylill(a, b) >> 15) + +#if 0 +#include "dsplib.h" + +#undef MAX16 +#define MAX16(a,b) _max(a,b) + +#undef MIN16 +#define MIN16(a,b) _min(a,b) + +#undef MAX32 +#define MAX32(a,b) _lmax(a,b) + +#undef MIN32 +#define MIN32(a,b) _lmin(a,b) + +#undef VSHR32 +#define VSHR32(a, shift) _lshl(a,-(shift)) + +#undef MULT16_16_Q15 +#define MULT16_16_Q15(a,b) (_smpy(a,b)) + +#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len))) +#define OVERRIDE_CELT_MAXABS16 + +#endif /* FIXED_C6X_H */ |