author    | Shri Borde <shri@google.com> | 2011-11-02 13:20:24 -0700
committer | Shri Borde <shri@google.com> | 2011-11-02 13:20:24 -0700
commit    | 7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa (patch)
tree      | b33940212e8eae6d9df454f5461279da919629cf /files/source/rotate_neon.s
parent    | 2398a6ec900d592b1433dc24eeeecf442794eb10 (diff)
download  | libyuv-7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa.tar.gz
Initial population of libyuv
Change-Id: I46a6a1525aebaba979b0f2ca5b58be2004901410
Diffstat (limited to 'files/source/rotate_neon.s')
-rw-r--r-- | files/source/rotate_neon.s | 563
1 file changed, 563 insertions, 0 deletions
diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
new file mode 100644
index 00000000..75ea957a
--- /dev/null
+++ b/files/source/rotate_neon.s
@@ -0,0 +1,563 @@
+  .global RestoreRegisters_NEON
+  .global ReverseLine_NEON
+  .global ReverseLineUV_NEON
+  .global SaveRegisters_NEON
+  .global TransposeWx8_NEON
+  .global TransposeUVWx8_NEON
+  .type RestoreRegisters_NEON, function
+  .type ReverseLine_NEON, function
+  .type ReverseLineUV_NEON, function
+  .type SaveRegisters_NEON, function
+  .type TransposeWx8_NEON, function
+  .type TransposeUVWx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
+ReverseLine_NEON:
+
+  @ compute where to start writing destination
+  add r1, r2 @ dst + width
+
+  @ work on segments that are multiples of 16
+  lsrs r3, r2, #4
+
+  @ the output is written in two blocks: 8 bytes followed
+  @ by another 8. reading is done sequentially, from left to
+  @ right. writing is done from right to left in 8 byte blocks.
+  @ r1, the destination pointer, is incremented after writing
+  @ the first of the two blocks. need to subtract that 8 off
+  @ along with 16 to get the next location.
+  mov r3, #-24
+
+  beq Lline_residuals
+
+  @ back the destination off by the size of the register that is
+  @ going to be reversed
+  sub r1, #16
+
+  @ the loop needs to run on blocks of 16. what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0. if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub r2, #16
+
+Lsegments_of_16:
+  vld1.8 {q0}, [r0]! @ src += 16
+
+  @ reverse the bytes in the 64 bit segments. unable to reverse
+  @ the bytes in the entire 128 bits in one go.
+  vrev64.8 q0, q0
+
+  @ because of the inability to reverse the entire 128 bits,
+  @ reverse the writing out of the two 64 bit segments.
+  vst1.8 {d1}, [r1]!
+  vst1.8 {d0}, [r1], r3 @ dst -= 16
+
+  subs r2, #16
+  bge Lsegments_of_16
+
+  @ add 16 back to the counter. if the result is 0 there are no
+  @ residuals, so return
+  adds r2, #16
+  bxeq lr
+
+  add r1, #16
+
+Lline_residuals:
+
+  mov r3, #-3
+
+  sub r1, #2
+  subs r2, #2
+  @ check for 16*n+1 scenarios where segments_of_2 should not
+  @ be run, but there is something left over.
+  blt Lsegment_of_1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_2:
+  vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
+
+  vst1.8 {d1[0]}, [r1]!
+  vst1.8 {d0[0]}, [r1], r3 @ dst -= 2
+
+  subs r2, #2
+  bge Lsegments_of_2
+
+  adds r2, #2
+  bxeq lr
+
+Lsegment_of_1:
+  add r1, #1
+  vld1.8 {d0[0]}, [r0]
+  vst1.8 {d0[0]}, [r1]
+
+  bx lr
+
+@ void TransposeWx8_NEON (const uint8* src, int src_stride,
+@                         uint8* dst, int dst_stride,
+@                         int w)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst
+@ r3 int dst_stride
+@ stack int w
+TransposeWx8_NEON:
+  push {r4,r8,r9,lr}
+
+  ldr r8, [sp, #16] @ width
+
+  @ loops are on blocks of 8. loop will stop when
+  @ counter gets to or below 0. starting the counter
+  @ at w-8 allows for this
+  sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+Lloop_8x8:
+  mov r9, r0
+
+  vld1.8 {d0}, [r9], r1
+  vld1.8 {d1}, [r9], r1
+  vld1.8 {d2}, [r9], r1
+  vld1.8 {d3}, [r9], r1
+  vld1.8 {d4}, [r9], r1
+  vld1.8 {d5}, [r9], r1
+  vld1.8 {d6}, [r9], r1
+  vld1.8 {d7}, [r9]
+
+  vtrn.8 d1, d0
+  vtrn.8 d3, d2
+  vtrn.8 d5, d4
+  vtrn.8 d7, d6
+
+  vtrn.16 d1, d3
+  vtrn.16 d0, d2
+  vtrn.16 d5, d7
+  vtrn.16 d4, d6
+
+  vtrn.32 d1, d5
+  vtrn.32 d0, d4
+  vtrn.32 d3, d7
+  vtrn.32 d2, d6
+
+  vrev16.8 q0, q0
+  vrev16.8 q1, q1
+  vrev16.8 q2, q2
+  vrev16.8 q3, q3
+
+  mov r9, r2
+
+  vst1.8 {d1}, [r9], r3
+  vst1.8 {d0}, [r9], r3
+  vst1.8 {d3}, [r9], r3
+  vst1.8 {d2}, [r9], r3
+  vst1.8 {d5}, [r9], r3
+  vst1.8 {d4}, [r9], r3
+  vst1.8 {d7}, [r9], r3
+  vst1.8 {d6}, [r9]
+
+  add r0, #8         @ src += 8
+  add r2, r3, lsl #3 @ dst += 8 * dst_stride
+  subs r8, #8        @ w -= 8
+  bge Lloop_8x8
+
+  @ add 8 back to counter. if the result is 0 there are
+  @ no residuals.
+  adds r8, #8
+  beq Ldone
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp r8, #2
+  blt Lblock_1x8
+
+  cmp r8, #4
+  blt Lblock_2x8
+
+Lblock_4x8:
+  mov r9, r0
+  vld1.32 {d0[0]}, [r9], r1
+  vld1.32 {d0[1]}, [r9], r1
+  vld1.32 {d1[0]}, [r9], r1
+  vld1.32 {d1[1]}, [r9], r1
+  vld1.32 {d2[0]}, [r9], r1
+  vld1.32 {d2[1]}, [r9], r1
+  vld1.32 {d3[0]}, [r9], r1
+  vld1.32 {d3[1]}, [r9]
+
+  mov r9, r2
+
+  adr r12, vtbl_4x4_transpose
+  vld1.8 {q3}, [r12]
+
+  vtbl.8 d4, {d0, d1}, d6
+  vtbl.8 d5, {d0, d1}, d7
+  vtbl.8 d0, {d2, d3}, d6
+  vtbl.8 d1, {d2, d3}, d7
+
+  @ TODO: rework shuffle above to write
+  @ out with 4 instead of 8 writes
+  vst1.32 {d4[0]}, [r9], r3
+  vst1.32 {d4[1]}, [r9], r3
+  vst1.32 {d5[0]}, [r9], r3
+  vst1.32 {d5[1]}, [r9]
+
+  add r9, r2, #4
+  vst1.32 {d0[0]}, [r9], r3
+  vst1.32 {d0[1]}, [r9], r3
+  vst1.32 {d1[0]}, [r9], r3
+  vst1.32 {d1[1]}, [r9]
+
+  add r0, #4         @ src += 4
+  add r2, r3, lsl #2 @ dst += 4 * dst_stride
+  subs r8, #4        @ w -= 4
+  beq Ldone
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp r8, #2
+  blt Lblock_1x8
+
+Lblock_2x8:
+  mov r9, r0
+  vld1.16 {d0[0]}, [r9], r1
+  vld1.16 {d1[0]}, [r9], r1
+  vld1.16 {d0[1]}, [r9], r1
+  vld1.16 {d1[1]}, [r9], r1
+  vld1.16 {d0[2]}, [r9], r1
+  vld1.16 {d1[2]}, [r9], r1
+  vld1.16 {d0[3]}, [r9], r1
+  vld1.16 {d1[3]}, [r9]
+
+  vtrn.8 d0, d1
+
+  mov r9, r2
+
+  vst1.64 {d0}, [r9], r3
+  vst1.64 {d1}, [r9]
+
+  add r0, #2         @ src += 2
+  add r2, r3, lsl #1 @ dst += 2 * dst_stride
+  subs r8, #2        @ w -= 2
+  beq Ldone
+
+Lblock_1x8:
+  vld1.8 {d0[0]}, [r0], r1
+  vld1.8 {d0[1]}, [r0], r1
+  vld1.8 {d0[2]}, [r0], r1
+  vld1.8 {d0[3]}, [r0], r1
+  vld1.8 {d0[4]}, [r0], r1
+  vld1.8 {d0[5]}, [r0], r1
+  vld1.8 {d0[6]}, [r0], r1
+  vld1.8 {d0[7]}, [r0]
+
+  vst1.64 {d0}, [r2]
+
+Ldone:
+
+  pop {r4,r8,r9,pc}
+
+vtbl_4x4_transpose:
+  .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+@ void SaveRegisters_NEON (unsigned long long store)
+@ r0 unsigned long long store
+SaveRegisters_NEON:
+  vst1.i64 {d8, d9, d10, d11}, [r0]!
+  vst1.i64 {d12, d13, d14, d15}, [r0]!
+  bx lr
+
+@ void RestoreRegisters_NEON (unsigned long long store)
+@ r0 unsigned long long store
+RestoreRegisters_NEON:
+  vld1.i64 {d8, d9, d10, d11}, [r0]!
+  vld1.i64 {d12, d13, d14, d15}, [r0]!
+  bx lr
+
+@ void ReverseLineUV_NEON (const uint8* src,
+@                          uint8* dst_a,
+@                          uint8* dst_b,
+@                          int width)
+@ r0 const uint8* src
+@ r1 uint8* dst_a
+@ r2 uint8* dst_b
+@ r3 width
+ReverseLineUV_NEON:
+
+  @ compute where to start writing destination
+  add r1, r1, r3 @ dst_a + width
+  add r2, r2, r3 @ dst_b + width
+
+  @ work on input segments that are multiples of 16, but the
+  @ width that has been passed is in output segments, half
+  @ the size of the input.
+  lsrs r12, r3, #3
+
+  beq Lline_residuals_di
+
+  @ the output is written into two blocks.
+  mov r12, #-8
+
+  @ back the destinations off by the size of the register that is
+  @ going to be reversed
+  sub r1, r1, #8
+  sub r2, r2, #8
+
+  @ the loop needs to run on blocks of 8. what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0. if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub r3, r3, #8
+
+Lsegments_of_8_di:
+  vld2.8 {d0, d1}, [r0]! @ src += 16
+
+  @ reverse the bytes in the 64 bit segments
+  vrev64.8 q0, q0
+
+  vst1.8 {d0}, [r1], r12 @ dst_a -= 8
+  vst1.8 {d1}, [r2], r12 @ dst_b -= 8
+
+  subs r3, r3, #8
+  bge Lsegments_of_8_di
+
+  @ add 8 back to the counter. if the result is 0 there are no
+  @ residuals, so return
+  adds r3, r3, #8
+  bxeq lr
+
+  add r1, r1, #8
+  add r2, r2, #8
+
+Lline_residuals_di:
+
+  mov r12, #-1
+
+  sub r1, r1, #1
+  sub r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_1:
+  vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2
+
+  vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1
+  vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1
+
+  subs r3, r3, #1
+  bgt Lsegments_of_1
+
+  bx lr
+
+@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
+@                           uint8* dst_a, int dst_stride_a,
+@                           uint8* dst_b, int dst_stride_b,
+@                           int width)
+@ r0 const uint8* src
+@ r1 int src_stride
+@ r2 uint8* dst_a
+@ r3 int dst_stride_a
+@ stack uint8* dst_b
+@ stack int dst_stride_b
+@ stack int width
+TransposeUVWx8_NEON:
+  push {r4-r9,lr}
+
+  ldr r4, [sp, #28] @ dst_b
+  ldr r5, [sp, #32] @ dst_stride_b
+  ldr r8, [sp, #36] @ width
+  @ loops are on blocks of 8. loop will stop when
+  @ counter gets to or below 0. starting the counter
+  @ at w-8 allows for this
+  sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+Lloop_8x8_di:
+  mov r9, r0
+
+  vld2.8 {d0, d1}, [r9], r1
+  vld2.8 {d2, d3}, [r9], r1
+  vld2.8 {d4, d5}, [r9], r1
+  vld2.8 {d6, d7}, [r9], r1
+  vld2.8 {d8, d9}, [r9], r1
+  vld2.8 {d10, d11}, [r9], r1
+  vld2.8 {d12, d13}, [r9], r1
+  vld2.8 {d14, d15}, [r9]
+
+  vtrn.8 q1, q0
+  vtrn.8 q3, q2
+  vtrn.8 q5, q4
+  vtrn.8 q7, q6
+
+  vtrn.16 q1, q3
+  vtrn.16 q0, q2
+  vtrn.16 q5, q7
+  vtrn.16 q4, q6
+
+  vtrn.32 q1, q5
+  vtrn.32 q0, q4
+  vtrn.32 q3, q7
+  vtrn.32 q2, q6
+
+  vrev16.8 q0, q0
+  vrev16.8 q1, q1
+  vrev16.8 q2, q2
+  vrev16.8 q3, q3
+  vrev16.8 q4, q4
+  vrev16.8 q5, q5
+  vrev16.8 q6, q6
+  vrev16.8 q7, q7
+
+  mov r9, r2
+
+  vst1.8 {d2}, [r9], r3
+  vst1.8 {d0}, [r9], r3
+  vst1.8 {d6}, [r9], r3
+  vst1.8 {d4}, [r9], r3
+  vst1.8 {d10}, [r9], r3
+  vst1.8 {d8}, [r9], r3
+  vst1.8 {d14}, [r9], r3
+  vst1.8 {d12}, [r9]
+
+  mov r9, r4
+
+  vst1.8 {d3}, [r9], r5
+  vst1.8 {d1}, [r9], r5
+  vst1.8 {d7}, [r9], r5
+  vst1.8 {d5}, [r9], r5
+  vst1.8 {d11}, [r9], r5
+  vst1.8 {d9}, [r9], r5
+  vst1.8 {d15}, [r9], r5
+  vst1.8 {d13}, [r9]
+
+  add r0, #8*2       @ src += 8*2
+  add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a
+  add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b
+  subs r8, #8        @ w -= 8
+  bge Lloop_8x8_di
+
+  @ add 8 back to counter. if the result is 0 there are
+  @ no residuals.
+  adds r8, #8
+  beq Ldone_di
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp r8, #2
+  blt Lblock_1x8_di
+
+  cmp r8, #4
+  blt Lblock_2x8_di
+
+@ TODO(frkoenig): clean this up
+Lblock_4x8_di:
+  mov r9, r0
+  vld1.64 {d0}, [r9], r1
+  vld1.64 {d1}, [r9], r1
+  vld1.64 {d2}, [r9], r1
+  vld1.64 {d3}, [r9], r1
+  vld1.64 {d4}, [r9], r1
+  vld1.64 {d5}, [r9], r1
+  vld1.64 {d6}, [r9], r1
+  vld1.64 {d7}, [r9]
+
+  adr r12, vtbl_4x4_transpose_di
+  vld1.8 {q7}, [r12]
+
+  vtrn.8 q0, q1
+  vtrn.8 q2, q3
+
+  vtbl.8 d8, {d0, d1}, d14
+  vtbl.8 d9, {d0, d1}, d15
+  vtbl.8 d10, {d2, d3}, d14
+  vtbl.8 d11, {d2, d3}, d15
+  vtbl.8 d12, {d4, d5}, d14
+  vtbl.8 d13, {d4, d5}, d15
+  vtbl.8 d0, {d6, d7}, d14
+  vtbl.8 d1, {d6, d7}, d15
+
+  mov r9, r2
+
+  vst1.32 {d8[0]}, [r9], r3
+  vst1.32 {d8[1]}, [r9], r3
+  vst1.32 {d9[0]}, [r9], r3
+  vst1.32 {d9[1]}, [r9], r3
+
+  add r9, r2, #4
+  vst1.32 {d12[0]}, [r9], r3
+  vst1.32 {d12[1]}, [r9], r3
+  vst1.32 {d13[0]}, [r9], r3
+  vst1.32 {d13[1]}, [r9]
+
+  mov r9, r4
+
+  vst1.32 {d10[0]}, [r9], r5
+  vst1.32 {d10[1]}, [r9], r5
+  vst1.32 {d11[0]}, [r9], r5
+  vst1.32 {d11[1]}, [r9], r5
+
+  add r9, r4, #4
+  vst1.32 {d0[0]}, [r9], r5
+  vst1.32 {d0[1]}, [r9], r5
+  vst1.32 {d1[0]}, [r9], r5
+  vst1.32 {d1[1]}, [r9]
+
+  add r0, #4*2       @ src += 4 * 2
+  add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a
+  add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b
+  subs r8, #4        @ w -= 4
+  beq Ldone_di
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp r8, #2
+  blt Lblock_1x8_di
+
+Lblock_2x8_di:
+  mov r9, r0
+  vld2.16 {d0[0], d2[0]}, [r9], r1
+  vld2.16 {d1[0], d3[0]}, [r9], r1
+  vld2.16 {d0[1], d2[1]}, [r9], r1
+  vld2.16 {d1[1], d3[1]}, [r9], r1
+  vld2.16 {d0[2], d2[2]}, [r9], r1
+  vld2.16 {d1[2], d3[2]}, [r9], r1
+  vld2.16 {d0[3], d2[3]}, [r9], r1
+  vld2.16 {d1[3], d3[3]}, [r9]
+
+  vtrn.8 d0, d1
+  vtrn.8 d2, d3
+
+  mov r9, r2
+
+  vst1.64 {d0}, [r9], r3
+  vst1.64 {d2}, [r9]
+
+  mov r9, r4
+
+  vst1.64 {d1}, [r9], r5
+  vst1.64 {d3}, [r9]
+
+  add r0, #2*2       @ src += 2 * 2
+  add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a
+  add r4, r5, lsl #1 @ dst_b += 2 * dst_stride_b
+  subs r8, #2        @ w -= 2
+  beq Ldone_di
+
+Lblock_1x8_di:
+  vld2.8 {d0[0], d1[0]}, [r0], r1
+  vld2.8 {d0[1], d1[1]}, [r0], r1
+  vld2.8 {d0[2], d1[2]}, [r0], r1
+  vld2.8 {d0[3], d1[3]}, [r0], r1
+  vld2.8 {d0[4], d1[4]}, [r0], r1
+  vld2.8 {d0[5], d1[5]}, [r0], r1
+  vld2.8 {d0[6], d1[6]}, [r0], r1
+  vld2.8 {d0[7], d1[7]}, [r0]
+
+  vst1.64 {d0}, [r2]
+  vst1.64 {d1}, [r4]
+
+Ldone_di:
+  pop {r4-r9, pc}
+
+vtbl_4x4_transpose_di:
+  .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
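
For readers skimming the assembly, here is a rough C-level view of what this file exports. The prototypes are taken from the comments in the diff above; the uint8 typedef, treating the SaveRegisters_NEON/RestoreRegisters_NEON argument as a pointer to a 64-byte spill area for d8-d15, and the scalar ReverseLine_C reference are illustrative assumptions, not part of the commit.

/* C-level sketch of the routines added by rotate_neon.s.
   Prototypes follow the comments in the assembly; the pointer type for the
   register save/restore helpers is an assumption based on how r0 is used. */
typedef unsigned char uint8;

void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
void ReverseLineUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int w);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width);
void SaveRegisters_NEON(unsigned long long* store);    /* stores d8-d15 (64 bytes) */
void RestoreRegisters_NEON(unsigned long long* store); /* reloads d8-d15 */

/* Scalar reference for ReverseLine_NEON: mirror one row of bytes.
   The NEON path computes the same result 16 bytes at a time with vrev64.8,
   then falls back to 2-byte and 1-byte steps for the leftovers. */
void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[width - 1 - i] = src[i];
  }
}

ReverseLineUV_NEON follows the same pattern on interleaved UV data: it deinterleaves with vld2.8, so each reversed output row (dst_a, dst_b) is half the length of the input row, which is why its loop counts in output pixels.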