diff options
Diffstat (limited to 'files/source/rotate_neon.s')
-rw-r--r-- | files/source/rotate_neon.s | 563 |
1 files changed, 0 insertions, 563 deletions
@ Recovered from: diff --git a/files/source/rotate_neon.s b/files/source/rotate_neon.s
@ (deleted file mode 100644, index 75ea957a..00000000, @@ -1,563 +0,0 @@)
@
@ 32-bit ARM NEON routines, GNU as syntax ('@' starts a comment).
@ Arguments arrive in r0-r3 and then on the stack, as listed in each
@ function header below.

        .text

        .global RestoreRegisters_NEON
        .global ReverseLine_NEON
        .global ReverseLineUV_NEON
        .global SaveRegisters_NEON
        .global TransposeWx8_NEON
        .global TransposeUVWx8_NEON
        .type   RestoreRegisters_NEON, function
        .type   ReverseLine_NEON, function
        .type   ReverseLineUV_NEON, function
        .type   SaveRegisters_NEON, function
        .type   TransposeWx8_NEON, function
        .type   TransposeUVWx8_NEON, function

@-----------------------------------------------------------------------
@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@   r0  const uint8* src
@   r1  uint8* dst
@   r2  int width
@
@ Copies width bytes from src to dst in reverse order:
@ dst[width - 1 - i] = src[i]. Returns without touching memory when
@ width <= 0.
@ Clobbers: r0-r3, q0, flags.
@-----------------------------------------------------------------------
ReverseLine_NEON:
        @ nothing to do for an empty line. without this check a width
        @ of 0 falls through to Lsegment_of_1 and writes dst[-1].
        cmp     r2, #0
        bxle    lr

        @ compute where to start writing destination
        add     r1, r1, r2              @ dst + width

        @ fewer than 16 bytes: only the residual paths are needed
        cmp     r2, #16
        blt     Lline_residuals

        @ the output is written in two blocks, 8 bytes followed by
        @ another 8. reading is done sequentially, from left to right.
        @ writing is done from right to left in 16 byte steps. r1, the
        @ destination pointer, is incremented by 8 after writing the
        @ first of the two blocks. that 8 has to be subtracted off
        @ along with 16 to get to the next location.
        mov     r3, #-24

        @ back off destination by the size of the register that is
        @ going to be reversed
        sub     r1, r1, #16

        @ the loop needs to run on blocks of 16. what will be left
        @ over is either a negative number, the residuals that need
        @ to be done, or 0. if this isn't subtracted off here the
        @ loop will run one extra time.
        sub     r2, r2, #16

Lsegments_of_16:
        vld1.8  {q0}, [r0]!             @ src += 16

        @ reverse the bytes in the 64 bit segments. unable to reverse
        @ the bytes in the entire 128 bits in one go.
        vrev64.8 q0, q0

        @ because of the inability to reverse the entire 128 bits
        @ reverse the writing out of the two 64 bit segments.
        vst1.8  {d1}, [r1]!             @ dst += 8
        vst1.8  {d0}, [r1], r3          @ dst -= 24, net dst -= 16

        subs    r2, r2, #16
        bge     Lsegments_of_16

        @ add 16 back to the counter. if the result is 0 there are no
        @ residuals so return
        adds    r2, r2, #16
        bxeq    lr

        @ undo the back-off so r1 points one past the unwritten bytes
        add     r1, r1, #16

Lline_residuals:
        @ here r2 = bytes remaining (1..15) and r1 = one past the last
        @ destination byte still to be written.

        mov     r3, #-3                 @ post-increment for the second store

        sub     r1, r1, #2
        subs    r2, r2, #2
        @ check for 16*n+1 scenarios where segments_of_2 should not
        @ be run, but there is something left over.
        blt     Lsegment_of_1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_2:
        vld2.8  {d0[0], d1[0]}, [r0]!   @ src += 2

        vst1.8  {d1[0]}, [r1]!          @ dst += 1
        vst1.8  {d0[0]}, [r1], r3       @ dst -= 3, net dst -= 2

        subs    r2, r2, #2
        bge     Lsegments_of_2

        adds    r2, r2, #2
        bxeq    lr

Lsegment_of_1:
        @ r1 sits 2 below the end of the unwritten region; the single
        @ remaining byte belongs 1 above that.
        add     r1, r1, #1
        vld1.8  {d0[0]}, [r0]
        vst1.8  {d0[0]}, [r1]

        bx      lr

@-----------------------------------------------------------------------
@ void TransposeWx8_NEON (const uint8* src, int src_stride,
@                         uint8* dst, int dst_stride,
@                         int w)
@   r0     const uint8* src
@   r1     int src_stride
@   r2     uint8* dst
@   r3     int dst_stride
@   stack  int w
@
@ Transposes a block of 8 source rows by w columns: for every column
@ j in [0, w) the 8 bytes src[i * src_stride + j], i = 0..7, are
@ written contiguously at dst + j * dst_stride. Returns without
@ touching memory when w <= 0.
@ Clobbers: r0, r2, r12, q0-q3, flags (r4, r8, r9 are saved/restored).
@-----------------------------------------------------------------------
TransposeWx8_NEON:
        push    {r4,r8,r9,lr}

        ldr     r8, [sp, #16]           @ width (4 registers pushed)

        @ loops are on blocks of 8. loop will stop when
        @ counter gets to or below 0. starting the counter
        @ at w-8 allows for this. when w < 8 the 8x8 loop must not
        @ run at all, otherwise it reads and writes a full 8x8 tile
        @ past the requested width.
        subs    r8, r8, #8
        blt     Lresiduals_Wx8

@ handle 8x8 blocks. this should be the majority of the plane
Lloop_8x8:
        mov     r9, r0

        vld1.8  {d0}, [r9], r1
        vld1.8  {d1}, [r9], r1
        vld1.8  {d2}, [r9], r1
        vld1.8  {d3}, [r9], r1
        vld1.8  {d4}, [r9], r1
        vld1.8  {d5}, [r9], r1
        vld1.8  {d6}, [r9], r1
        vld1.8  {d7}, [r9]

        vtrn.8  d1, d0
        vtrn.8  d3, d2
        vtrn.8  d5, d4
        vtrn.8  d7, d6

        vtrn.16 d1, d3
        vtrn.16 d0, d2
        vtrn.16 d5, d7
        vtrn.16 d4, d6

        vtrn.32 d1, d5
        vtrn.32 d0, d4
        vtrn.32 d3, d7
        vtrn.32 d2, d6

        vrev16.8 q0, q0
        vrev16.8 q1, q1
        vrev16.8 q2, q2
        vrev16.8 q3, q3

        mov     r9, r2

        vst1.8  {d1}, [r9], r3
        vst1.8  {d0}, [r9], r3
        vst1.8  {d3}, [r9], r3
        vst1.8  {d2}, [r9], r3
        vst1.8  {d5}, [r9], r3
        vst1.8  {d4}, [r9], r3
        vst1.8  {d7}, [r9], r3
        vst1.8  {d6}, [r9]

        add     r0, r0, #8              @ src += 8
        add     r2, r2, r3, lsl #3      @ dst += 8 * dst_stride
        subs    r8, r8, #8              @ w   -= 8
        bge     Lloop_8x8

Lresiduals_Wx8:
        @ add 8 back to counter. if the result is 0 there are
        @ no residuals. a negative result means w was negative.
        adds    r8, r8, #8
        ble     Ldone

        @ some residual, so between 1 and 7 lines left to transpose
        cmp     r8, #2
        blt     Lblock_1x8

        cmp     r8, #4
        blt     Lblock_2x8

Lblock_4x8:
        @ 4 bytes from each of the 8 rows: rows 0-3 in q0, 4-7 in q1
        mov     r9, r0
        vld1.32 {d0[0]}, [r9], r1
        vld1.32 {d0[1]}, [r9], r1
        vld1.32 {d1[0]}, [r9], r1
        vld1.32 {d1[1]}, [r9], r1
        vld1.32 {d2[0]}, [r9], r1
        vld1.32 {d2[1]}, [r9], r1
        vld1.32 {d3[0]}, [r9], r1
        vld1.32 {d3[1]}, [r9]

        mov     r9, r2

        adr     r12, vtbl_4x4_transpose
        vld1.8  {q3}, [r12]

        @ each table lookup transposes one 4x4 group of bytes
        vtbl.8  d4, {d0, d1}, d6
        vtbl.8  d5, {d0, d1}, d7
        vtbl.8  d0, {d2, d3}, d6
        vtbl.8  d1, {d2, d3}, d7

        @ TODO: rework shuffle above to write
        @       out with 4 instead of 8 writes
        vst1.32 {d4[0]}, [r9], r3
        vst1.32 {d4[1]}, [r9], r3
        vst1.32 {d5[0]}, [r9], r3
        vst1.32 {d5[1]}, [r9]

        add     r9, r2, #4              @ second half of each dst row
        vst1.32 {d0[0]}, [r9], r3
        vst1.32 {d0[1]}, [r9], r3
        vst1.32 {d1[0]}, [r9], r3
        vst1.32 {d1[1]}, [r9]

        add     r0, r0, #4              @ src += 4
        add     r2, r2, r3, lsl #2      @ dst += 4 * dst_stride
        subs    r8, r8, #4              @ w   -= 4
        beq     Ldone

        @ some residual, check to see if it includes a 2x8 block,
        @ or less
        cmp     r8, #2
        blt     Lblock_1x8

Lblock_2x8:
        @ 2 bytes from each row; even rows go to d0, odd rows to d1
        mov     r9, r0
        vld1.16 {d0[0]}, [r9], r1
        vld1.16 {d1[0]}, [r9], r1
        vld1.16 {d0[1]}, [r9], r1
        vld1.16 {d1[1]}, [r9], r1
        vld1.16 {d0[2]}, [r9], r1
        vld1.16 {d1[2]}, [r9], r1
        vld1.16 {d0[3]}, [r9], r1
        vld1.16 {d1[3]}, [r9]

        vtrn.8  d0, d1

        mov     r9, r2

        vst1.64 {d0}, [r9], r3
        vst1.64 {d1}, [r9]

        add     r0, r0, #2              @ src += 2
        add     r2, r2, r3, lsl #1      @ dst += 2 * dst_stride
        subs    r8, r8, #2              @ w   -= 2
        beq     Ldone

Lblock_1x8:
        @ one byte from each of the 8 rows forms one dst row
        vld1.8  {d0[0]}, [r0], r1
        vld1.8  {d0[1]}, [r0], r1
        vld1.8  {d0[2]}, [r0], r1
        vld1.8  {d0[3]}, [r0], r1
        vld1.8  {d0[4]}, [r0], r1
        vld1.8  {d0[5]}, [r0], r1
        vld1.8  {d0[6]}, [r0], r1
        vld1.8  {d0[7]}, [r0]

        vst1.64 {d0}, [r2]

Ldone:

        pop     {r4,r8,r9,pc}

@ vtbl indices used by Lblock_4x8 to transpose 4x4 groups of bytes
vtbl_4x4_transpose:
        .byte   0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
        .align  2

@-----------------------------------------------------------------------
@ void SaveRegisters_NEON (unsigned long long* store)
@   r0  unsigned long long* store
@
@ Stores d8-d15 to the 64 bytes starting at store. r0 is used as an
@ address and is advanced by 64 on return.
@-----------------------------------------------------------------------
SaveRegisters_NEON:
        vst1.i64 {d8, d9, d10, d11}, [r0]!
        vst1.i64 {d12, d13, d14, d15}, [r0]!
        bx      lr

@-----------------------------------------------------------------------
@ void RestoreRegisters_NEON (unsigned long long* store)
@   r0  unsigned long long* store
@
@ Loads d8-d15 from the 64 bytes starting at store. r0 is used as an
@ address and is advanced by 64 on return.
@-----------------------------------------------------------------------
RestoreRegisters_NEON:
        vld1.i64 {d8, d9, d10, d11}, [r0]!
        vld1.i64 {d12, d13, d14, d15}, [r0]!
        bx      lr

@-----------------------------------------------------------------------
@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@   r0  const uint8* src
@   r1  uint8* dst_a
@   r2  uint8* dst_b
@   r3  int width
@
@ De-interleaves width byte pairs from src and writes them reversed:
@ dst_a[width - 1 - i] = src[2 * i], dst_b[width - 1 - i] = src[2 * i + 1].
@ Returns without touching memory when width <= 0.
@ Clobbers: r0-r3, r12, q0, flags.
@-----------------------------------------------------------------------
ReverseLineUV_NEON:
        @ nothing to do for an empty line. without this check the
        @ residual loop below runs once and writes dst_a[-1]/dst_b[-1].
        cmp     r3, #0
        bxle    lr

        @ compute where to start writing destination
        add     r1, r1, r3              @ dst_a + width
        add     r2, r2, r3              @ dst_b + width

        @ work on input segments that are multiples of 16, but
        @ width that has been passed is output segments, half
        @ the size of input.
        cmp     r3, #8
        blt     Lline_residuals_di

        @ each output is written in blocks of 8, right to left
        mov     r12, #-8

        @ back off destination by the size of the register that is
        @ going to be reversed
        sub     r1, r1, #8
        sub     r2, r2, #8

        @ the loop needs to run on blocks of 8. what will be left
        @ over is either a negative number, the residuals that need
        @ to be done, or 0. if this isn't subtracted off here the
        @ loop will run one extra time.
        sub     r3, r3, #8

Lsegments_of_8_di:
        vld2.8  {d0, d1}, [r0]!         @ src += 16

        @ reverse the bytes in the 64 bit segments
        vrev64.8 q0, q0

        vst1.8  {d0}, [r1], r12         @ dst_a -= 8
        vst1.8  {d1}, [r2], r12         @ dst_b -= 8

        subs    r3, r3, #8
        bge     Lsegments_of_8_di

        @ add 8 back to the counter. if the result is 0 there are no
        @ residuals so return
        adds    r3, r3, #8
        bxeq    lr

        @ undo the back-off so r1/r2 point one past the unwritten bytes
        add     r1, r1, #8
        add     r2, r2, #8

Lline_residuals_di:
        @ here r3 = pairs remaining (1..7)

        mov     r12, #-1

        sub     r1, r1, #1
        sub     r2, r2, #1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_1:
        vld2.8  {d0[0], d1[0]}, [r0]!   @ src += 2

        vst1.8  {d0[0]}, [r1], r12      @ dst_a -= 1
        vst1.8  {d1[0]}, [r2], r12      @ dst_b -= 1

        subs    r3, r3, #1
        bgt     Lsegments_of_1

        bx      lr

@-----------------------------------------------------------------------
@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@   r0     const uint8* src
@   r1     int src_stride
@   r2     uint8* dst_a
@   r3     int dst_stride_a
@   stack  uint8* dst_b
@   stack  int dst_stride_b
@   stack  int width
@
@ Transposes a block of 8 source rows by width interleaved byte pairs.
@ For every pair column j in [0, width) the first bytes of the pairs,
@ src[i * src_stride + 2 * j], i = 0..7, are written contiguously at
@ dst_a + j * dst_stride_a and the second bytes at
@ dst_b + j * dst_stride_b. Returns without touching memory when
@ width <= 0.
@
@ d8-d15 are used as scratch and are callee-saved under the AAPCS, so
@ they are saved and restored here. Callers that bracket this routine
@ with SaveRegisters_NEON/RestoreRegisters_NEON continue to work.
@ Clobbers: r0, r2, r12, q0-q3, flags.
@-----------------------------------------------------------------------
TransposeUVWx8_NEON:
        push    {r4-r9,lr}

        @ 7 registers pushed, so the stack arguments start at sp + 28.
        @ they are loaded before vpush moves sp again.
        ldr     r4, [sp, #28]           @ dst_b
        ldr     r5, [sp, #32]           @ dst_stride_b
        ldr     r8, [sp, #36]           @ width

        vpush   {d8-d15}                @ preserve callee-saved NEON registers

        @ loops are on blocks of 8. loop will stop when
        @ counter gets to or below 0. starting the counter
        @ at w-8 allows for this. when width < 8 the 8x8 loop must
        @ not run at all, otherwise it reads and writes a full 8x8
        @ tile past the requested width.
        subs    r8, r8, #8
        blt     Lresiduals_UVWx8

@ handle 8x8 blocks. this should be the majority of the plane
Lloop_8x8_di:
        mov     r9, r0

        vld2.8  {d0, d1}, [r9], r1
        vld2.8  {d2, d3}, [r9], r1
        vld2.8  {d4, d5}, [r9], r1
        vld2.8  {d6, d7}, [r9], r1
        vld2.8  {d8, d9}, [r9], r1
        vld2.8  {d10, d11}, [r9], r1
        vld2.8  {d12, d13}, [r9], r1
        vld2.8  {d14, d15}, [r9]

        vtrn.8  q1, q0
        vtrn.8  q3, q2
        vtrn.8  q5, q4
        vtrn.8  q7, q6

        vtrn.16 q1, q3
        vtrn.16 q0, q2
        vtrn.16 q5, q7
        vtrn.16 q4, q6

        vtrn.32 q1, q5
        vtrn.32 q0, q4
        vtrn.32 q3, q7
        vtrn.32 q2, q6

        vrev16.8 q0, q0
        vrev16.8 q1, q1
        vrev16.8 q2, q2
        vrev16.8 q3, q3
        vrev16.8 q4, q4
        vrev16.8 q5, q5
        vrev16.8 q6, q6
        vrev16.8 q7, q7

        mov     r9, r2

        vst1.8  {d2}, [r9], r3
        vst1.8  {d0}, [r9], r3
        vst1.8  {d6}, [r9], r3
        vst1.8  {d4}, [r9], r3
        vst1.8  {d10}, [r9], r3
        vst1.8  {d8}, [r9], r3
        vst1.8  {d14}, [r9], r3
        vst1.8  {d12}, [r9]

        mov     r9, r4

        vst1.8  {d3}, [r9], r5
        vst1.8  {d1}, [r9], r5
        vst1.8  {d7}, [r9], r5
        vst1.8  {d5}, [r9], r5
        vst1.8  {d11}, [r9], r5
        vst1.8  {d9}, [r9], r5
        vst1.8  {d15}, [r9], r5
        vst1.8  {d13}, [r9]

        add     r0, r0, #8*2            @ src   += 8*2
        add     r2, r2, r3, lsl #3      @ dst_a += 8 * dst_stride_a
        add     r4, r4, r5, lsl #3      @ dst_b += 8 * dst_stride_b
        subs    r8, r8, #8              @ w     -= 8
        bge     Lloop_8x8_di

Lresiduals_UVWx8:
        @ add 8 back to counter. if the result is 0 there are
        @ no residuals. a negative result means width was negative.
        adds    r8, r8, #8
        ble     Ldone_di

        @ some residual, so between 1 and 7 lines left to transpose
        cmp     r8, #2
        blt     Lblock_1x8_di

        cmp     r8, #4
        blt     Lblock_2x8_di

@ TODO(frkoenig) : clean this up
Lblock_4x8_di:
        @ 4 byte pairs (8 bytes) from each of the 8 rows
        mov     r9, r0
        vld1.64 {d0}, [r9], r1
        vld1.64 {d1}, [r9], r1
        vld1.64 {d2}, [r9], r1
        vld1.64 {d3}, [r9], r1
        vld1.64 {d4}, [r9], r1
        vld1.64 {d5}, [r9], r1
        vld1.64 {d6}, [r9], r1
        vld1.64 {d7}, [r9]

        adr     r12, vtbl_4x4_transpose_di
        vld1.8  {q7}, [r12]

        @ split first/second bytes of the pairs: q0/q2 end up holding
        @ the first bytes, q1/q3 the second bytes
        vtrn.8  q0, q1
        vtrn.8  q2, q3

        vtbl.8  d8, {d0, d1}, d14
        vtbl.8  d9, {d0, d1}, d15
        vtbl.8  d10, {d2, d3}, d14
        vtbl.8  d11, {d2, d3}, d15
        vtbl.8  d12, {d4, d5}, d14
        vtbl.8  d13, {d4, d5}, d15
        vtbl.8  d0, {d6, d7}, d14
        vtbl.8  d1, {d6, d7}, d15

        mov     r9, r2

        vst1.32 {d8[0]}, [r9], r3
        vst1.32 {d8[1]}, [r9], r3
        vst1.32 {d9[0]}, [r9], r3
        vst1.32 {d9[1]}, [r9], r3

        add     r9, r2, #4              @ second half of each dst_a row
        vst1.32 {d12[0]}, [r9], r3
        vst1.32 {d12[1]}, [r9], r3
        vst1.32 {d13[0]}, [r9], r3
        vst1.32 {d13[1]}, [r9]

        mov     r9, r4

        vst1.32 {d10[0]}, [r9], r5
        vst1.32 {d10[1]}, [r9], r5
        vst1.32 {d11[0]}, [r9], r5
        vst1.32 {d11[1]}, [r9], r5

        add     r9, r4, #4              @ second half of each dst_b row
        vst1.32 {d0[0]}, [r9], r5
        vst1.32 {d0[1]}, [r9], r5
        vst1.32 {d1[0]}, [r9], r5
        vst1.32 {d1[1]}, [r9]

        add     r0, r0, #4*2            @ src   += 4 * 2
        add     r2, r2, r3, lsl #2      @ dst_a += 4 * dst_stride_a
        add     r4, r4, r5, lsl #2      @ dst_b += 4 * dst_stride_b
        subs    r8, r8, #4              @ w     -= 4
        beq     Ldone_di

        @ some residual, check to see if it includes a 2x8 block,
        @ or less
        cmp     r8, #2
        blt     Lblock_1x8_di

Lblock_2x8_di:
        @ 2 byte pairs from each row. pair 0 goes to d0/d1 (even/odd
        @ rows), pair 1 goes to d2/d3.
        mov     r9, r0
        vld2.16 {d0[0], d2[0]}, [r9], r1
        vld2.16 {d1[0], d3[0]}, [r9], r1
        vld2.16 {d0[1], d2[1]}, [r9], r1
        vld2.16 {d1[1], d3[1]}, [r9], r1
        vld2.16 {d0[2], d2[2]}, [r9], r1
        vld2.16 {d1[2], d3[2]}, [r9], r1
        vld2.16 {d0[3], d2[3]}, [r9], r1
        vld2.16 {d1[3], d3[3]}, [r9]

        vtrn.8  d0, d1
        vtrn.8  d2, d3

        mov     r9, r2

        vst1.64 {d0}, [r9], r3
        vst1.64 {d2}, [r9]

        mov     r9, r4

        vst1.64 {d1}, [r9], r5
        vst1.64 {d3}, [r9]

        add     r0, r0, #2*2            @ src   += 2 * 2
        add     r2, r2, r3, lsl #1      @ dst_a += 2 * dst_stride_a
        add     r4, r4, r5, lsl #1      @ dst_b += 2 * dst_stride_b
        subs    r8, r8, #2              @ w     -= 2
        beq     Ldone_di

Lblock_1x8_di:
        @ one byte pair from each of the 8 rows forms one row of
        @ dst_a (d0) and one row of dst_b (d1)
        vld2.8  {d0[0], d1[0]}, [r0], r1
        vld2.8  {d0[1], d1[1]}, [r0], r1
        vld2.8  {d0[2], d1[2]}, [r0], r1
        vld2.8  {d0[3], d1[3]}, [r0], r1
        vld2.8  {d0[4], d1[4]}, [r0], r1
        vld2.8  {d0[5], d1[5]}, [r0], r1
        vld2.8  {d0[6], d1[6]}, [r0], r1
        vld2.8  {d0[7], d1[7]}, [r0]

        vst1.64 {d0}, [r2]
        vst1.64 {d1}, [r4]

Ldone_di:
        vpop    {d8-d15}
        pop     {r4-r9, pc}

@ vtbl indices used by Lblock_4x8_di after the vtrn.8 split
vtbl_4x4_transpose_di:
        .byte   0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
        .align  2