Make celt_pitch_xcorr_edsp() work on ARMv5TE.

We were assuming that LDR, LDRD, and STRD could be used on unaligned addresses, but this turns out not to be true on really old hardware.
author: Timothy B. Terriberry <tterribe@xiph.org> 2013-11-26 21:51:39 -0800
committer: Timothy B. Terriberry <tterribe@xiph.org> 2013-11-26 21:58:35 -0800
commit: 5c02c5ffb58a08c1b705e6fa2ab73f631f658823 (patch)
tree: 4c6cf12db4fd4eb597b38d8210d540dfb25b91e2 /celt/arm
parent: aa8ff201b5f8c489c52f00d83cd188bb168c310b (diff)
download: libopus-5c02c5ffb58a08c1b705e6fa2ab73f631f658823.tar.gz
1 files changed, 132 insertions, 32 deletions
diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s
index 2db681d2..0f1bf5f6 100644
--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ b/celt/arm/celt_pitch_xcorr_arm.s
@@ -412,8 +412,8 @@ IF OPUS_ARM_MAY_HAVE_EDSP
 xcorr_kernel_edsp PROC
   ; input:
   ;   r3      = int         len
-  ;   r4      = opus_val16 *_x
-  ;   r5      = opus_val16 *_y
+  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
+  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
   ;   r6...r9 = opus_val32  sum[4]
   ; output:
   ;   r6...r9 = opus_val32  sum[4]
@@ -423,8 +423,9 @@ xcorr_kernel_edsp PROC
   ;   r12,r14 = opus_val16  x[4]
   ;   r10,r11 = opus_val16  y[4]
   STMFD        sp!, {r2,r4,r5,lr}
+  LDR          r10, [r5], #4      ; Load y[0...1]
   SUBS         r2, r3, #4         ; j = len-4
-  LDRD         r10, r11, [r5], #8 ; Load y[0...3]
+  LDR          r11, [r5], #4      ; Load y[2...3]
   BLE xcorr_kernel_edsp_process4_done
   LDR          r12, [r4], #4      ; Load x[0...1]
   ; Stall
@@ -493,8 +494,8 @@ xcorr_kernel_edsp_done
 
 celt_pitch_xcorr_edsp PROC
   ; input:
-  ;   r0  = opus_val16 *_x
-  ;   r1  = opus_val16 *_y
+  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
+  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
   ;   r2  = opus_val32 *xcorr
   ;   r3  = int         len
   ; output:
@@ -512,11 +513,56 @@ celt_pitch_xcorr_edsp PROC
   MOV          r5, r1
   LDR          r1, [sp, #36]
   MOV          r4, r0
+  TST          r5, #3
   ; maxcorr = 1
   MOV          r0, #1
-  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done
+  BEQ          celt_pitch_xcorr_edsp_process1u_done
+; Compute one sum at the start to make y 32-bit aligned.
+  SUBS         r12, r3, #4
+  ; r14 = sum = 0
+  MOV          r14, #0
+  LDRH         r8, [r5], #2
+  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+  LDR          r7, [r4], #4
+celt_pitch_xcorr_edsp_process1u_loop4
+  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
+  LDR          r10, [r5], #4
+  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
+  LDRGT        r6, [r4], #4
+  SMLATB       r14, r7, r10, r14    ; sum = MAC16_16(sum, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  MOV          r8, r10, LSR #16
+  LDRGT        r7, [r4], #4
+  BGT celt_pitch_xcorr_edsp_process1u_loop4
+celt_pitch_xcorr_edsp_process1u_loop4_done
+  ADDS         r12, r12, #4
+celt_pitch_xcorr_edsp_process1u_loop1
+  LDRGEH       r6, [r4], #2
+  ; Stall
+  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
+  SUBGES       r12, r12, #1
+  LDRGTH       r8, [r5], #2
+  BGT celt_pitch_xcorr_edsp_process1u_loop1
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum)
+  CMP          r0, r14
+  ADD          r5, r5, #2
+  MOVLT        r0, r14
+  SUBS         r1, r1, #1
+  ; xcorr[i] = sum
+  STR          r14, [r2], #4
+  BLE celt_pitch_xcorr_edsp_done
+celt_pitch_xcorr_edsp_process1u_done
+  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
   SUBS         r1, r1, #4
-  BLT celt_pitch_xcorr_edsp_process4_done
+  BLT celt_pitch_xcorr_edsp_process2
 celt_pitch_xcorr_edsp_process4
   ; xcorr_kernel_edsp parameters:
   ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
@@ -531,30 +577,93 @@ celt_pitch_xcorr_edsp_process4
   ADD          r5, r5, #8
   MOVLT        r0, r6
   CMP          r0, r7
-  STRD         r6, r7, [r2], #8
   MOVLT        r0, r7
   CMP          r0, r8
-  STRD         r8, r9, [r2], #8
   MOVLT        r0, r8
   CMP          r0, r9
   MOVLT        r0, r9
+  STMIA        r2!, {r6-r9}
   SUBS         r1, r1, #4
   BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process4_done
-  ADDS         r1, r1, #4
-  BLE celt_pitch_xcorr_edsp_done
-; Now compute each remaining sum one at a time.
-celt_pitch_xcorr_edsp_process_remaining
+celt_pitch_xcorr_edsp_process2
+  ADDS         r1, r1, #2
+  BLT celt_pitch_xcorr_edsp_process1a
+  SUBS         r12, r3, #4
+  ; {r10, r11} = {sum0, sum1} = {0, 0}
+  MOV          r10, #0
+  MOV          r11, #0
+  LDR          r8, [r5], #4
+  BLE celt_pitch_xcorr_edsp_process2_loop_done
+  LDR          r6, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process2_loop4
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r7, [r4], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUBS         r12, r12, #4         ; j-=4
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  LDR          r8, [r5], #4
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+  LDRGT        r6, [r4], #4
+  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
+  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
+  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
+  LDRGT        r9, [r5], #4
+  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
+  BGT celt_pitch_xcorr_edsp_process2_loop4
+celt_pitch_xcorr_edsp_process2_loop_done
+  ADDS         r12, r12, #2
+  BLE  celt_pitch_xcorr_edsp_process2_1
+  LDR          r6, [r4], #4
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDR          r9, [r5], #4
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  SUB          r12, r12, #2
+  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
+  MOV          r8, r9
+  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
+celt_pitch_xcorr_edsp_process2_1
+  LDRH         r6, [r4], #2
+  ADDS         r12, r12, #1
+  ; Stall
+  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
+  LDRGTH       r7, [r4], #2
+  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
+  BLE celt_pitch_xcorr_edsp_process2_done
+  LDRH         r9, [r5], #2
+  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_1)
+  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_0, y_2)
+celt_pitch_xcorr_edsp_process2_done
+  ; Restore _x
+  SUB          r4, r4, r3, LSL #1
+  ; Restore and advance _y
+  SUB          r5, r5, r3, LSL #1
+  ; maxcorr = max(maxcorr, sum0)
+  CMP          r0, r10
+  ADD          r5, r5, #2
+  MOVLT        r0, r10
+  SUB          r1, r1, #2
+  ; maxcorr = max(maxcorr, sum1)
+  CMP          r0, r11
+  ; xcorr[i] = sum
+  STR          r10, [r2], #4
+  MOVLT        r0, r11
+  STR          r11, [r2], #4
+celt_pitch_xcorr_edsp_process1a
+  ADDS         r1, r1, #1
+  BLT celt_pitch_xcorr_edsp_done
   SUBS         r12, r3, #4
   ; r14 = sum = 0
   MOV          r14, #0
-  BLT celt_pitch_xcorr_edsp_process_remaining_loop_done
-  LDRD         r6, r7, [r4], #8
-  LDRD         r8, r9, [r5], #8
-  ; Stall
-celt_pitch_xcorr_edsp_process_remaining_loop4
+  BLT celt_pitch_xcorr_edsp_process1a_loop_done
+  LDR          r6, [r4], #4
+  LDR          r8, [r5], #4
+  LDR          r7, [r4], #4
+  LDR          r9, [r5], #4
+celt_pitch_xcorr_edsp_process1a_loop4
   SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
-  SUBS         r12, r12, #4         ; j--
+  SUBS         r12, r12, #4         ; j-=4
   SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
   LDRGE        r6, [r4], #4
   SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
@@ -562,8 +671,8 @@ celt_pitch_xcorr_edsp_process_remaining_loop4
   SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
   LDRGE        r7, [r4], #4
   LDRGE        r9, [r5], #4
-  BGE celt_pitch_xcorr_edsp_process_remaining_loop4
-celt_pitch_xcorr_edsp_process_remaining_loop_done
+  BGE celt_pitch_xcorr_edsp_process1a_loop4
+celt_pitch_xcorr_edsp_process1a_loop_done
   ADDS         r12, r12, #2
   LDRGE        r6, [r4], #4
   LDRGE        r8, [r5], #4
@@ -574,21 +683,12 @@ celt_pitch_xcorr_edsp_process_remaining_loop_done
   ADDS         r12, r12, #1
   LDRGEH       r6, [r4], #2
   LDRGEH       r8, [r5], #2
-  ; Restore _x
-  SUB          r4, r4, r3, LSL #1
-  ; Stall
   SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
-  ; Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
   ; maxcorr = max(maxcorr, sum)
-  ; Stall
   CMP          r0, r14
-  ADD          r5, r5, #2
-  MOVLT        r0, r14
-  SUBS         r1, r1, #1
   ; xcorr[i] = sum
   STR          r14, [r2], #4
-  BGT celt_pitch_xcorr_edsp_process_remaining
+  MOVLT        r0, r14
 celt_pitch_xcorr_edsp_done
   LDMFD        sp!, {r4-r11, pc}
   ENDP
author	Timothy B. Terriberry <tterribe@xiph.org>	2013-11-26 21:51:39 -0800
committer	Timothy B. Terriberry <tterribe@xiph.org>	2013-11-26 21:58:35 -0800
commit	5c02c5ffb58a08c1b705e6fa2ab73f631f658823 (patch)
tree	4c6cf12db4fd4eb597b38d8210d540dfb25b91e2 /celt/arm
parent	aa8ff201b5f8c489c52f00d83cd188bb168c310b (diff)
download	libopus-5c02c5ffb58a08c1b705e6fa2ab73f631f658823.tar.gz