diff options
Diffstat (limited to 'common/arm/svc/isvc_iquant_itrans_recon_neon.c')
-rw-r--r-- | common/arm/svc/isvc_iquant_itrans_recon_neon.c | 452 |
1 files changed, 0 insertions, 452 deletions
diff --git a/common/arm/svc/isvc_iquant_itrans_recon_neon.c b/common/arm/svc/isvc_iquant_itrans_recon_neon.c index 270adde..8a97fbc 100644 --- a/common/arm/svc/isvc_iquant_itrans_recon_neon.c +++ b/common/arm/svc/isvc_iquant_itrans_recon_neon.c @@ -587,193 +587,6 @@ void isvc_iquant_itrans_recon_4x4_with_res_accumulate_neon( vreinterpret_u32_u8(pred23_un), 1); } -void isvc_iquant_itrans_recon_chroma_4x4_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - - WORD16 i2_rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - int16x4x4_t src_16x4x2; - int16x4x4_t iscal_16x4x2; - int16x4x4_t weigh_16x4x2; - - int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4; - int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4; - int16x4_t rq1_16x4, rq3_16x4; - int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4; - int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8; - int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4; - int16x4x2_t xx0_16x4x2, xx1_16x4x2; - int32x2x2_t x0_32x2x2, x1_32x2x2; - int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4; - - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - int16x8_t rec0, rec1, rec2, rec3; - uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un; - uint8x8_t out0, out1, out2, out3; - - uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff)); - - int16x4_t pos_255_16x4 = vdup_n_s16(((WORD16) UINT8_MAX)); - int16x4_t neg_255_16x4 = vdup_n_s16(-((WORD16) UINT8_MAX)); - int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6); - int32x4_t rnd_fact = vdupq_n_s32(i2_rnd_factor); - - UNUSED(i4_iq_start_idx); - UNUSED(ps_res); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - src_16x4x2 = vld4_s16(pi2_src); - iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat); - weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat); - - weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]); - weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]); - weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]); - weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]); - - q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]); - q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]); - q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]); - q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]); - - q0_32x4 = vaddq_s32(q0_32x4, rnd_fact); - q1_32x4 = vaddq_s32(q1_32x4, rnd_fact); - q2_32x4 = vaddq_s32(q2_32x4, rnd_fact); - q3_32x4 = vaddq_s32(q3_32x4, rnd_fact); - - q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4); - q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4); - q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4); - q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4); - - q0_16x4 = vqshrn_n_s32(q0_32x4, 4); - q1_16x4 = vqshrn_n_s32(q1_32x4, 4); - q2_16x4 = vqshrn_n_s32(q2_32x4, 4); - q3_16x4 = vqshrn_n_s32(q3_32x4, 4); - - q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0); - - rq1_16x4 = vshr_n_s16(q1_16x4, 1); - rq3_16x4 = vshr_n_s16(q3_16x4, 1); - - x0_16x4 = vadd_s16(q0_16x4, q2_16x4); - x1_16x4 = vsub_s16(q0_16x4, q2_16x4); - x2_16x4 = vsub_s16(rq1_16x4, q3_16x4); - x3_16x4 = vadd_s16(q1_16x4, rq3_16x4); - - xx0_16x4 = vadd_s16(x0_16x4, x3_16x4); - xx1_16x4 = vadd_s16(x1_16x4, x2_16x4); - xx2_16x4 = vsub_s16(x1_16x4, x2_16x4); - xx3_16x4 = vsub_s16(x0_16x4, x3_16x4); - - /* row 0 to row 3 */ - xx0_16x4x2 = vtrn_s16(xx0_16x4, xx1_16x4); - xx1_16x4x2 = vtrn_s16(xx2_16x4, xx3_16x4); - x0_32x2x2 = - vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[0]), vreinterpret_s32_s16(xx1_16x4x2.val[0])); - x1_32x2x2 = - vzip_s32(vreinterpret_s32_s16(xx0_16x4x2.val[1]), vreinterpret_s32_s16(xx1_16x4x2.val[1])); - - x0_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[0]); - x1_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[0]); - x2_16x4 = vreinterpret_s16_s32(x0_32x2x2.val[1]); - x3_16x4 = vreinterpret_s16_s32(x1_32x2x2.val[1]); - - /* Store Horz transform output into temp */ - vst1_s16(pi2_tmp, x0_16x4); - vst1_s16(pi2_tmp + 4, x1_16x4); - vst1_s16(pi2_tmp + 8, x2_16x4); - vst1_s16(pi2_tmp + 12, x3_16x4); - - /* vertical inverse transform */ - rq1_16x4 = vshr_n_s16(x1_16x4, 1); - rq3_16x4 = vshr_n_s16(x3_16x4, 1); - - xx0_16x4 = vadd_s16(x0_16x4, x2_16x4); - xx1_16x4 = vsub_s16(x0_16x4, x2_16x4); - xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4); - xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4); - - x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4); - x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4); - x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4); - x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4); - - x0_16x4 = vrshr_n_s16(x0_16x4, 6); - x1_16x4 = vrshr_n_s16(x1_16x4, 6); - x2_16x4 = vrshr_n_s16(x2_16x4, 6); - x3_16x4 = vrshr_n_s16(x3_16x4, 6); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - x0_16x4 = vmax_s16(x0_16x4, neg_255_16x4); - x1_16x4 = vmax_s16(x1_16x4, neg_255_16x4); - x2_16x4 = vmax_s16(x2_16x4, neg_255_16x4); - x3_16x4 = vmax_s16(x3_16x4, neg_255_16x4); - - /* Saturate all values > 255 to 255 and retain the rest as it is */ - x0_16x4 = vmin_s16(x0_16x4, pos_255_16x4); - x1_16x4 = vmin_s16(x1_16x4, pos_255_16x4); - x2_16x4 = vmin_s16(x2_16x4, pos_255_16x4); - x3_16x4 = vmin_s16(x3_16x4, pos_255_16x4); - - x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4)); - x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4)); - x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4)); - x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4)); - - pred0_in = vld1_u8((uint8_t *) pu1_pred); - pred1_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride)); - pred2_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride << 1)); - pred3_in = vld1_u8((uint8_t *) pu1_pred + (i4_pred_stride * 3)); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = pred + res */ - rec0 = vaddq_s16(pred0, x0_16x8); - rec1 = vaddq_s16(pred1, x1_16x8); - rec2 = vaddq_s16(pred2, x2_16x8); - rec3 = vaddq_s16(pred3, x3_16x8); - - out0 = vld1_u8(pu1_out); - out1 = vld1_u8(pu1_out + i4_out_stride); - out2 = vld1_u8(pu1_out + i4_out_stride * 2); - out3 = vld1_u8(pu1_out + i4_out_stride * 3); - - /* Convert to 8 bit unsigned with saturation */ - rec0_un = vqmovun_s16(rec0); - rec1_un = vqmovun_s16(rec1); - rec2_un = vqmovun_s16(rec2); - rec3_un = vqmovun_s16(rec3); - - /* Store in alternate postions */ - out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0); - out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1); - out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2); - out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3); - - vst1_u8((pu1_out), out0); - vst1_u8((pu1_out + i4_out_stride), out1); - vst1_u8((pu1_out + (i4_out_stride << 1)), out2); - vst1_u8((pu1_out + ((i4_out_stride << 1) + i4_out_stride)), out3); -} - void isvc_iquant_itrans_recon_chroma_4x4_with_res_output_neon( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, @@ -1280,271 +1093,6 @@ void isvc_iquant_itrans_recon_4x4_dc_neon(buffer_container_t *ps_src, buffer_con vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0); } -void isvc_iquant_itrans_recon_4x4_dc_with_res_output_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - WORD16 i2_it_out; - WORD32 i4_iq_out_temp; - int16x8_t temp_0; - int16x4_t residue_res; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - - UNUSED(pi2_tmp); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - if(i4_iq_start_idx == 0) - { - i4_iq_out_temp = pi2_src[0]; - INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - } - else - { - i4_iq_out_temp = pi2_dc_src[0]; - } - - i2_it_out = ((i4_iq_out_temp + 32) >> 6); - temp_0 = vdupq_n_s16(i2_it_out); - residue_res = vdup_n_s16(isvc_get_residue(i2_it_out, 0, 0)); - - vst1_s16(pi2_res, residue_res); - vst1_s16(pi2_res + i4_res_stride, residue_res); - vst1_s16(pi2_res + (i4_res_stride << 1), residue_res); - vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, residue_res); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = Res + pred */ - pred0 = vaddq_s16(pred0, temp_0); - pred1 = vaddq_s16(pred1, temp_0); - pred2 = vaddq_s16(pred2, temp_0); - pred3 = vaddq_s16(pred3, temp_0); - - /* Convert to unsigned 8 bit with saturation */ - pred0_in = vqmovun_s16(pred0); - pred1_in = vqmovun_s16(pred1); - pred2_in = vqmovun_s16(pred2); - pred3_in = vqmovun_s16(pred3); - - vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred1_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 2), vreinterpret_u32_u8(pred2_in), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride * 3), vreinterpret_u32_u8(pred3_in), 0); -} - -void isvc_iquant_itrans_recon_4x4_dc_with_res_accumulate_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - WORD16 *pi2_res = (WORD16 *) ps_res->pv_data; - WORD16 *pi2_res_pred = (WORD16 *) ps_res_pred->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_res_stride = ps_res->i4_data_stride; - WORD32 i4_res_pred_stride = ps_res_pred->i4_data_stride; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; - - WORD32 i4_iq_out_temp; - int16x4_t temp_0; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - int16x8_t pred01_in, pred23_in; - uint8x8_t pred01_un, pred23_un; - - int16x4_t resd0_in, resd1_in, resd2_in, resd3_in; - int16x8_t resd01_in, resd23_in; - int16x4_t pos_255 = vdup_n_s16(((WORD16) UINT8_MAX)); - int16x4_t neg_255 = vdup_n_s16(-((WORD16) UINT8_MAX)); - - UNUSED(pi2_tmp); - UNUSED(u1_res_accumulate); - - if(i4_iq_start_idx == 0) - { - i4_iq_out_temp = pi2_src[0]; - INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); - } - else - { - i4_iq_out_temp = pi2_dc_src[0]; - } - - temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6); - - resd0_in = vld1_s16((int16_t *) pi2_res_pred); - resd1_in = vld1_s16((int16_t *) pi2_res_pred + i4_res_pred_stride); - resd2_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 2)); - resd3_in = vld1_s16((int16_t *) pi2_res_pred + (i4_res_pred_stride * 3)); - - /* Add res pred to the res obtained */ - resd0_in = vadd_s16(resd0_in, temp_0); - resd1_in = vadd_s16(resd1_in, temp_0); - resd2_in = vadd_s16(resd2_in, temp_0); - resd3_in = vadd_s16(resd3_in, temp_0); - - /* Saturate all values < -255 to -255 and retain the rest as it is */ - resd0_in = vmax_s16(resd0_in, neg_255); - resd1_in = vmax_s16(resd1_in, neg_255); - resd2_in = vmax_s16(resd2_in, neg_255); - resd3_in = vmax_s16(resd3_in, neg_255); - - /* Saturate all values > 255 to 255 and retain the rest as it is */ - resd0_in = vmin_s16(resd0_in, pos_255); - resd1_in = vmin_s16(resd1_in, pos_255); - resd2_in = vmin_s16(resd2_in, pos_255); - resd3_in = vmin_s16(resd3_in, pos_255); - - vst1_s16(pi2_res, resd0_in); - vst1_s16(pi2_res + i4_res_stride, resd1_in); - vst1_s16(pi2_res + (i4_res_stride << 1), resd2_in); - vst1_s16(pi2_res + (i4_res_stride << 1) + i4_res_stride, resd3_in); - - resd01_in = vcombine_s16(resd0_in, resd1_in); - resd23_in = vcombine_s16(resd2_in, resd3_in); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - pred01_in = vcombine_s16(vget_low_s16(pred0), vget_low_s16(pred1)); - pred23_in = vcombine_s16(vget_low_s16(pred2), vget_low_s16(pred3)); - - /* Out pixel = Res + pred */ - pred01_in = vaddq_s16(pred01_in, resd01_in); - pred23_in = vaddq_s16(pred23_in, resd23_in); - - /* Convert to unsigned 8 bit with saturation */ - pred01_un = vqmovun_s16(pred01_in); - pred23_un = vqmovun_s16(pred23_in); - - vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0); - vst1_lane_u32((uint32_t *) (pu1_out + i4_out_stride), vreinterpret_u32_u8(pred01_un), 1); - vst1_lane_u32((uint32_t *) (pu1_out + (i4_out_stride << 1)), vreinterpret_u32_u8(pred23_un), 0); - vst1_lane_u32((uint32_t *) (pu1_out + ((i4_out_stride << 1) + i4_out_stride)), - vreinterpret_u32_u8(pred23_un), 1); -} - -void isvc_iquant_itrans_recon_chroma_4x4_dc_neon( - buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, - buffer_container_t *ps_res, buffer_container_t *ps_rec, - iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants, WORD16 *pi2_tmp, WORD16 *pi2_dc_src, - WORD32 i4_iq_start_idx, UWORD8 u1_res_accumulate) -{ - WORD16 *pi2_src = (WORD16 *) ps_src->pv_data; - UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data; - UWORD8 *pu1_out = (UWORD8 *) ps_rec->pv_data; - WORD32 i4_pred_stride = ps_pred->i4_data_stride; - WORD32 i4_out_stride = ps_rec->i4_data_stride; - const UWORD16 *pu2_iscal_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat; - const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat; - UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6; - - WORD32 i4_iq_out_temp; - int16x8_t temp_0; - uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in; - int16x8_t pred0, pred1, pred2, pred3; - uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3; - uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff)); - - UNUSED(pi2_src); - UNUSED(pu2_iscal_mat); - UNUSED(pu2_weigh_mat); - UNUSED(u4_qp_div_6); - UNUSED(pi2_tmp); - UNUSED(i4_iq_start_idx); - UNUSED(ps_res); - UNUSED(ps_res_pred); - UNUSED(u1_res_accumulate); - - i4_iq_out_temp = pi2_dc_src[0]; - temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6); - - pred0_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred1_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred2_in = vld1_u8(pu1_pred); - pu1_pred = pu1_pred + i4_pred_stride; - pred3_in = vld1_u8(pu1_pred); - - pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in)); - pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in)); - pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in)); - pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in)); - - /* Out pixel = Res + pred */ - pred0 = vaddq_s16(pred0, temp_0); - pred1 = vaddq_s16(pred1, temp_0); - pred2 = vaddq_s16(pred2, temp_0); - pred3 = vaddq_s16(pred3, temp_0); - - /* Convert to unsigned 8 bit with saturation */ - pred0_in = vqmovun_s16(pred0); - pred1_in = vqmovun_s16(pred1); - pred2_in = vqmovun_s16(pred2); - pred3_in = vqmovun_s16(pred3); - - i4_out_horz_8x8_r0 = vld1_u8(pu1_out); - i4_out_horz_8x8_r1 = vld1_u8(pu1_out + i4_out_stride); - i4_out_horz_8x8_r2 = vld1_u8(pu1_out + i4_out_stride * 2); - i4_out_horz_8x8_r3 = vld1_u8(pu1_out + i4_out_stride * 3); - - /* Store out pixels in alternate positions */ - i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0); - i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1); - i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2); - i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3); - - vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride), i4_out_horz_8x8_r1); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 2), i4_out_horz_8x8_r2); - vst1_u8((uint8_t *) (pu1_out + i4_out_stride * 3), i4_out_horz_8x8_r3); -} - void isvc_iquant_itrans_recon_chroma_4x4_dc_with_res_output_neon( buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_res_pred, buffer_container_t *ps_res, buffer_container_t *ps_rec, |