author    Shri Borde <shri@google.com>  2011-11-02 13:20:24 -0700
committer Shri Borde <shri@google.com>  2011-11-02 13:20:24 -0700
commit    7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa (patch)
tree      b33940212e8eae6d9df454f5461279da919629cf /files/source/rotate_neon.s
parent    2398a6ec900d592b1433dc24eeeecf442794eb10 (diff)
download  libyuv-7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa.tar.gz
Initial population of libyuv
Change-Id: I46a6a1525aebaba979b0f2ca5b58be2004901410
Diffstat (limited to 'files/source/rotate_neon.s')
-rw-r--r--  files/source/rotate_neon.s  563
1 files changed, 563 insertions, 0 deletions
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
new file mode 100644
index 00000000..75ea957a
--- /dev/null
+++ b/files/source/rotate_neon.s
@@ -0,0 +1,563 @@
+ .global RestoreRegisters_NEON
+ .global ReverseLine_NEON
+ .global ReverseLineUV_NEON
+ .global SaveRegisters_NEON
+ .global TransposeWx8_NEON
+ .global TransposeUVWx8_NEON
+ .type RestoreRegisters_NEON, function
+ .type ReverseLine_NEON, function
+ .type ReverseLineUV_NEON, function
+ .type SaveRegisters_NEON, function
+ .type TransposeWx8_NEON, function
+ .type TransposeUVWx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
+ReverseLine_NEON:
+
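+ @ bytes are read forward in 16 byte blocks, reversed with
+ @ vrev64.8, and written backward starting from dst + width;
+ @ leftover bytes are handled 2 at a time, then 1.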
+ @ compute where to start writing destination
+ add r1, r2 @ dst + width
+
+ @ work on segments that are multiples of 16
+ lsrs r3, r2, #4
+
+ @ the output is written in two blocks: 8 bytes followed
+ @ by another 8. reading is done sequentially, from left to
+ @ right. writing is done from right to left in 8 byte blocks.
+ @ r1, the destination pointer, is incremented after writing
+ @ the first of the two blocks, so that 8 needs to be
+ @ subtracted off along with 16 to get the next location.
+ mov r3, #-24
+
+ beq Lline_residuals
+
+ @ back off the destination by the size of the register that is
+ @ going to be reversed
+ sub r1, #16
+
+ @ the loop needs to run on blocks of 16. what will be left
+ @ over is either a negative number, the residuals that need
+ @ to be done, or 0. if this isn't subtracted off here the
+ @ loop will run one extra time.
+ sub r2, #16
+
+Lsegments_of_16:
+ vld1.8 {q0}, [r0]! @ src += 16
+
+ @ reverse the bytes in the 64 bit segments. unable to reverse
+ @ the bytes in the entire 128 bits in one go.
+ vrev64.8 q0, q0
+
+ @ because of the inability to reverse the entire 128 bits
+ @ reverse the writing out of the two 64 bit segments.
+ vst1.8 {d1}, [r1]!
+ vst1.8 {d0}, [r1], r3 @ dst -= 16
+
+ subs r2, #16
+ bge Lsegments_of_16
+
+ @ add 16 back to the counter. if the result is 0 there are no
+ @ residuals, so return
+ adds r2, #16
+ bxeq lr
+
+ add r1, #16
+
+Lline_residuals:
+
+ mov r3, #-3
+
+ sub r1, #2
+ subs r2, #2
+ @ check for 16*n+1 scenarios where segments_of_2 should not
+ @ be run, but there is something left over.
+ blt Lsegment_of_1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_2:
+ vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
+
+ vst1.8 {d1[0]}, [r1]!
+ vst1.8 {d0[0]}, [r1], r3 @ dst -= 2
+
+ subs r2, #2
+ bge Lsegments_of_2
+
+ adds r2, #2
+ bxeq lr
+
+Lsegment_of_1:
+ add r1, #1
+ vld1.8 {d0[0]}, [r0]
+ vst1.8 {d0[0]}, [r1]
+
+ bx lr
+
+@ void TransposeWx8_NEON (const uint8* src, int src_stride,
+@ uint8* dst, int dst_stride,
+@ int w)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst
+@ r3 int dst_stride
+@ stack int w
+TransposeWx8_NEON:
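+ @ transposes a block w bytes wide and 8 rows tall: each source
+ @ column becomes an 8 byte dst row. 8x8 blocks are handled with
+ @ vtrn/vrev, followed by 4x8, 2x8 and 1x8 residual blocks.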
+ push {r4,r8,r9,lr}
+
+ ldr r8, [sp, #16] @ width
+
+ @ loops are on blocks of 8. the loop will stop when the
+ @ counter gets to or below 0. starting the counter
+ @ at w-8 allows for this
+ sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+Lloop_8x8:
+ mov r9, r0
+
+ vld1.8 {d0}, [r9], r1
+ vld1.8 {d1}, [r9], r1
+ vld1.8 {d2}, [r9], r1
+ vld1.8 {d3}, [r9], r1
+ vld1.8 {d4}, [r9], r1
+ vld1.8 {d5}, [r9], r1
+ vld1.8 {d6}, [r9], r1
+ vld1.8 {d7}, [r9]
+
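+ @ transpose the 8x8 block held in d0-d7: vtrn at 8, 16 and
+ @ 32 bit granularity exchanges progressively larger element
+ @ pairs; the vrev16.8 pass and the swapped store order below
+ @ finish the byte level transpose.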
+ vtrn.8 d1, d0
+ vtrn.8 d3, d2
+ vtrn.8 d5, d4
+ vtrn.8 d7, d6
+
+ vtrn.16 d1, d3
+ vtrn.16 d0, d2
+ vtrn.16 d5, d7
+ vtrn.16 d4, d6
+
+ vtrn.32 d1, d5
+ vtrn.32 d0, d4
+ vtrn.32 d3, d7
+ vtrn.32 d2, d6
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+
+ mov r9, r2
+
+ vst1.8 {d1}, [r9], r3
+ vst1.8 {d0}, [r9], r3
+ vst1.8 {d3}, [r9], r3
+ vst1.8 {d2}, [r9], r3
+ vst1.8 {d5}, [r9], r3
+ vst1.8 {d4}, [r9], r3
+ vst1.8 {d7}, [r9], r3
+ vst1.8 {d6}, [r9]
+
+ add r0, #8 @ src += 8
+ add r2, r3, lsl #3 @ dst += 8 * dst_stride
+ subs r8, #8 @ w -= 8
+ bge Lloop_8x8
+
+ @ add 8 back to counter. if the result is 0 there are
+ @ no residuals.
+ adds r8, #8
+ beq Ldone
+
+ @ some residual, so between 1 and 7 lines left to transpose
+ cmp r8, #2
+ blt Lblock_1x8
+
+ cmp r8, #4
+ blt Lblock_2x8
+
+Lblock_4x8:
+ mov r9, r0
+ vld1.32 {d0[0]}, [r9], r1
+ vld1.32 {d0[1]}, [r9], r1
+ vld1.32 {d1[0]}, [r9], r1
+ vld1.32 {d1[1]}, [r9], r1
+ vld1.32 {d2[0]}, [r9], r1
+ vld1.32 {d2[1]}, [r9], r1
+ vld1.32 {d3[0]}, [r9], r1
+ vld1.32 {d3[1]}, [r9]
+
+ mov r9, r2
+
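+ @ vtbl_4x4_transpose lists byte indices that transpose a 4x4
+ @ block per lookup: d4/d5 get the left half of the four dst
+ @ rows (from src rows 0-3), d0/d1 the right half (rows 4-7).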
+ adr r12, vtbl_4x4_transpose
+ vld1.8 {q3}, [r12]
+
+ vtbl.8 d4, {d0, d1}, d6
+ vtbl.8 d5, {d0, d1}, d7
+ vtbl.8 d0, {d2, d3}, d6
+ vtbl.8 d1, {d2, d3}, d7
+
+ @ TODO: rework shuffle above to write
+ @ out with 4 instead of 8 writes
+ vst1.32 {d4[0]}, [r9], r3
+ vst1.32 {d4[1]}, [r9], r3
+ vst1.32 {d5[0]}, [r9], r3
+ vst1.32 {d5[1]}, [r9]
+
+ add r9, r2, #4
+ vst1.32 {d0[0]}, [r9], r3
+ vst1.32 {d0[1]}, [r9], r3
+ vst1.32 {d1[0]}, [r9], r3
+ vst1.32 {d1[1]}, [r9]
+
+ add r0, #4 @ src += 4
+ add r2, r3, lsl #2 @ dst += 4 * dst_stride
+ subs r8, #4 @ w -= 4
+ beq Ldone
+
+ @ some residual, check to see if it includes a 2x8 block,
+ @ or less
+ cmp r8, #2
+ blt Lblock_1x8
+
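+ @ 2x8 residual: two bytes per row are gathered, alternating
+ @ rows between d0 and d1, and vtrn.8 then swaps bytes so d0
+ @ and d1 each hold one transposed dst row.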
+Lblock_2x8:
+ mov r9, r0
+ vld1.16 {d0[0]}, [r9], r1
+ vld1.16 {d1[0]}, [r9], r1
+ vld1.16 {d0[1]}, [r9], r1
+ vld1.16 {d1[1]}, [r9], r1
+ vld1.16 {d0[2]}, [r9], r1
+ vld1.16 {d1[2]}, [r9], r1
+ vld1.16 {d0[3]}, [r9], r1
+ vld1.16 {d1[3]}, [r9]
+
+ vtrn.8 d0, d1
+
+ mov r9, r2
+
+ vst1.64 {d0}, [r9], r3
+ vst1.64 {d1}, [r9]
+
+ add r0, #2 @ src += 2
+ add r2, r3, lsl #1 @ dst += 2 * dst_stride
+ subs r8, #2 @ w -= 2
+ beq Ldone
+
+Lblock_1x8:
+ vld1.8 {d0[0]}, [r0], r1
+ vld1.8 {d0[1]}, [r0], r1
+ vld1.8 {d0[2]}, [r0], r1
+ vld1.8 {d0[3]}, [r0], r1
+ vld1.8 {d0[4]}, [r0], r1
+ vld1.8 {d0[5]}, [r0], r1
+ vld1.8 {d0[6]}, [r0], r1
+ vld1.8 {d0[7]}, [r0]
+
+ vst1.64 {d0}, [r2]
+
+Ldone:
+
+ pop {r4,r8,r9,pc}
+
+vtbl_4x4_transpose:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+@ void SaveRegisters_NEON (unsigned long long store)
+@ r0 unsigned long long store
+SaveRegisters_NEON:
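+ @ stores the callee saved NEON registers d8-d15 (64 bytes)
+ @ to the buffer at r0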
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+@ void RestoreRegisters_NEON (unsigned long long store)
+@ r0 unsigned long long store
+RestoreRegisters_NEON:
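+ @ reloads d8-d15 from the buffer at r0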
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+@ void ReverseLineUV_NEON (const uint8* src,
+@ uint8* dst_a,
+@ uint8* dst_b,
+@ int width)
+@ r0 const uint8* src
+@ r1 uint8* dst_a
+@ r2 uint8* dst_b
+@ r3 width
+ReverseLineUV_NEON:
+
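+ @ de-interleave and reverse a UV row: vld2.8 splits 8 U/V
+ @ pairs at a time, vrev64.8 reverses each half, and the
+ @ halves are written backward into the separate U and V
+ @ destination planes.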
+ @ compute where to start writing destination
+ add r1, r1, r3 @ dst_a + width
+ add r2, r2, r3 @ dst_b + width
+
+ @ work on input segments that are multiples of 16, but
+ @ the width that has been passed counts output elements,
+ @ half the size of the input.
+ lsrs r12, r3, #3
+
+ beq Lline_residuals_di
+
+ @ the output is written out to two destination blocks.
+ mov r12, #-8
+
+ @ back off the destination by the size of the register that is
+ @ going to be reversed
+ sub r1, r1, #8
+ sub r2, r2, #8
+
+ @ the loop needs to run on blocks of 8. what will be left
+ @ over is either a negative number, the residuals that need
+ @ to be done, or 0. if this isn't subtracted off here the
+ @ loop will run one extra time.
+ sub r3, r3, #8
+
+Lsegments_of_8_di:
+ vld2.8 {d0, d1}, [r0]! @ src += 16
+
+ @ reverse the bytes in the 64 bit segments
+ vrev64.8 q0, q0
+
+ vst1.8 {d0}, [r1], r12 @ dst_a -= 8
+ vst1.8 {d1}, [r2], r12 @ dst_b -= 8
+
+ subs r3, r3, #8
+ bge Lsegments_of_8_di
+
+ @ add 8 back to the counter. if the result is 0 there are no
+ @ residuals, so return
+ adds r3, r3, #8
+ bxeq lr
+
+ add r1, r1, #8
+ add r2, r2, #8
+
+Lline_residuals_di:
+
+ mov r12, #-1
+
+ sub r1, r1, #1
+ sub r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_1:
+ vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
+
+ vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1
+ vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1
+
+ subs r3, r3, #1
+ bgt Lsegments_of_1
+
+ bx lr
+
+@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
+@ uint8* dst_a, int dst_stride_a,
+@ uint8* dst_b, int dst_stride_b,
+@ int width)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst_a
+@ r3 int dst_stride_a
+@ stack uint8* dst_b
+@ stack int dst_stride_b
+@ stack int width
+TransposeUVWx8_NEON:
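+ @ transposes an interleaved UV block, width pairs wide and 8
+ @ rows tall, writing the transposed U plane to dst_a and the
+ @ V plane to dst_b.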
+ push {r4-r9,lr}
+
+ ldr r4, [sp, #28] @ dst_b
+ ldr r5, [sp, #32] @ dst_stride_b
+ ldr r8, [sp, #36] @ width
+ @ loops are on blocks of 8. the loop will stop when the
+ @ counter gets to or below 0. starting the counter
+ @ at w-8 allows for this
+ sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+Lloop_8x8_di:
+ mov r9, r0
+
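+ @ vld2.8 de-interleaves each row as it is loaded: U bytes
+ @ land in the even d registers, V bytes in the odd ones.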
+ vld2.8 {d0, d1}, [r9], r1
+ vld2.8 {d2, d3}, [r9], r1
+ vld2.8 {d4, d5}, [r9], r1
+ vld2.8 {d6, d7}, [r9], r1
+ vld2.8 {d8, d9}, [r9], r1
+ vld2.8 {d10, d11}, [r9], r1
+ vld2.8 {d12, d13}, [r9], r1
+ vld2.8 {d14, d15}, [r9]
+
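+ @ the U plane (even d registers) and the V plane (odd d
+ @ registers) are transposed in parallel: vtrn at 8, 16 and
+ @ 32 bit widths plus vrev16.8 and the swapped store order
+ @ complete both 8x8 transposes.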
+ vtrn.8 q1, q0
+ vtrn.8 q3, q2
+ vtrn.8 q5, q4
+ vtrn.8 q7, q6
+
+ vtrn.16 q1, q3
+ vtrn.16 q0, q2
+ vtrn.16 q5, q7
+ vtrn.16 q4, q6
+
+ vtrn.32 q1, q5
+ vtrn.32 q0, q4
+ vtrn.32 q3, q7
+ vtrn.32 q2, q6
+
+ vrev16.8 q0, q0
+ vrev16.8 q1, q1
+ vrev16.8 q2, q2
+ vrev16.8 q3, q3
+ vrev16.8 q4, q4
+ vrev16.8 q5, q5
+ vrev16.8 q6, q6
+ vrev16.8 q7, q7
+
+ mov r9, r2
+
+ vst1.8 {d2}, [r9], r3
+ vst1.8 {d0}, [r9], r3
+ vst1.8 {d6}, [r9], r3
+ vst1.8 {d4}, [r9], r3
+ vst1.8 {d10}, [r9], r3
+ vst1.8 {d8}, [r9], r3
+ vst1.8 {d14}, [r9], r3
+ vst1.8 {d12}, [r9]
+
+ mov r9, r4
+
+ vst1.8 {d3}, [r9], r5
+ vst1.8 {d1}, [r9], r5
+ vst1.8 {d7}, [r9], r5
+ vst1.8 {d5}, [r9], r5
+ vst1.8 {d11}, [r9], r5
+ vst1.8 {d9}, [r9], r5
+ vst1.8 {d15}, [r9], r5
+ vst1.8 {d13}, [r9]
+
+ add r0, #8*2 @ src += 8*2
+ add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a
+ add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b
+ subs r8, #8 @ w -= 8
+ bge Lloop_8x8_di
+
+ @ add 8 back to counter. if the result is 0 there are
+ @ no residuals.
+ adds r8, #8
+ beq Ldone_di
+
+ @ some residual, so between 1 and 7 lines left to transpose
+ cmp r8, #2
+ blt Lblock_1x8_di
+
+ cmp r8, #4
+ blt Lblock_2x8_di
+
+@ TODO(frkoenig) : clean this up
+Lblock_4x8_di:
+ mov r9, r0
+ vld1.64 {d0}, [r9], r1
+ vld1.64 {d1}, [r9], r1
+ vld1.64 {d2}, [r9], r1
+ vld1.64 {d3}, [r9], r1
+ vld1.64 {d4}, [r9], r1
+ vld1.64 {d5}, [r9], r1
+ vld1.64 {d6}, [r9], r1
+ vld1.64 {d7}, [r9]
+
+ adr r12, vtbl_4x4_transpose_di
+ vld1.8 {q7}, [r12]
+
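+ @ vtrn.8 separates the planes: q0/q2 end up holding the U
+ @ bytes and q1/q3 the V bytes; the table lookups then gather
+ @ transposed 4x4 blocks from each plane.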
+ vtrn.8 q0, q1
+ vtrn.8 q2, q3
+
+ vtbl.8 d8, {d0, d1}, d14
+ vtbl.8 d9, {d0, d1}, d15
+ vtbl.8 d10, {d2, d3}, d14
+ vtbl.8 d11, {d2, d3}, d15
+ vtbl.8 d12, {d4, d5}, d14
+ vtbl.8 d13, {d4, d5}, d15
+ vtbl.8 d0, {d6, d7}, d14
+ vtbl.8 d1, {d6, d7}, d15
+
+ mov r9, r2
+
+ vst1.32 {d8[0]}, [r9], r3
+ vst1.32 {d8[1]}, [r9], r3
+ vst1.32 {d9[0]}, [r9], r3
+ vst1.32 {d9[1]}, [r9], r3
+
+ add r9, r2, #4
+ vst1.32 {d12[0]}, [r9], r3
+ vst1.32 {d12[1]}, [r9], r3
+ vst1.32 {d13[0]}, [r9], r3
+ vst1.32 {d13[1]}, [r9]
+
+ mov r9, r4
+
+ vst1.32 {d10[0]}, [r9], r5
+ vst1.32 {d10[1]}, [r9], r5
+ vst1.32 {d11[0]}, [r9], r5
+ vst1.32 {d11[1]}, [r9], r5
+
+ add r9, r4, #4
+ vst1.32 {d0[0]}, [r9], r5
+ vst1.32 {d0[1]}, [r9], r5
+ vst1.32 {d1[0]}, [r9], r5
+ vst1.32 {d1[1]}, [r9]
+
+ add r0, #4*2 @ src += 4 * 2
+ add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
+ add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b
+ subs r8, #4 @ w -= 4
+ beq Ldone_di
+
+ @ some residual, check to see if it includes a 2x8 block,
+ @ or less
+ cmp r8, #2
+ blt Lblock_1x8_di
+
+Lblock_2x8_di:
+ mov r9, r0
+ vld2.16 {d0[0], d2[0]}, [r9], r1
+ vld2.16 {d1[0], d3[0]}, [r9], r1
+ vld2.16 {d0[1], d2[1]}, [r9], r1
+ vld2.16 {d1[1], d3[1]}, [r9], r1
+ vld2.16 {d0[2], d2[2]}, [r9], r1
+ vld2.16 {d1[2], d3[2]}, [r9], r1
+ vld2.16 {d0[3], d2[3]}, [r9], r1
+ vld2.16 {d1[3], d3[3]}, [r9]
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+
+ mov r9, r2
+
+ vst1.64 {d0}, [r9], r3
+ vst1.64 {d2}, [r9]
+
+ mov r9, r4
+
+ vst1.64 {d1}, [r9], r5
+ vst1.64 {d3}, [r9]
+
+ add r0, #2*2 @ src += 2 * 2
+ add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a
+ add r4, r5, lsl #1 @ dst_b += 2 * dst_stride_b
+ subs r8, #2 @ w -= 2
+ beq Ldone_di
+
+Lblock_1x8_di:
+ vld2.8 {d0[0], d1[0]}, [r0], r1
+ vld2.8 {d0[1], d1[1]}, [r0], r1
+ vld2.8 {d0[2], d1[2]}, [r0], r1
+ vld2.8 {d0[3], d1[3]}, [r0], r1
+ vld2.8 {d0[4], d1[4]}, [r0], r1
+ vld2.8 {d0[5], d1[5]}, [r0], r1
+ vld2.8 {d0[6], d1[6]}, [r0], r1
+ vld2.8 {d0[7], d1[7]}, [r0]
+
+ vst1.64 {d0}, [r2]
+ vst1.64 {d1}, [r4]
+
+Ldone_di:
+ pop {r4-r9, pc}
+
+vtbl_4x4_transpose_di:
+ .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15