aboutsummaryrefslogtreecommitdiff
path: root/silk
diff options
context:
space:
mode:
authorJonathan Lennox <jonathan@vidyo.com>2015-08-03 17:04:27 -0400
committerJean-Marc Valin <jmvalin@jmvalin.ca>2015-09-01 17:21:31 -0400
commit43120f00758219a784f952754f33b9788a8d731b (patch)
treeb1dd751c49c18c6d7c3a65f30be04476cb2903b6 /silk
parentcb0875e07d7cac701b465863f532dc5bb8b0eb59 (diff)
downloadlibopus-43120f00758219a784f952754f33b9788a8d731b.tar.gz
Reorganize x86 SSE intrinsics code.
Enable x86 intrinsics when building in floating-point mode. Support SSE as an arch value. Use RTCD to conditionally enable existing floating-point Celt SSE code. Call functions directly (without RTCD) when their architecture can be presumed. Use SSE4.1 intrinsics optimized code for Silk even in floating-point mode.
Diffstat (limited to 'silk')
-rw-r--r--silk/x86/SigProc_FIX_sse.h17
-rw-r--r--silk/x86/main_sse.h48
-rw-r--r--silk/x86/x86_silk_map.c25
3 files changed, 83 insertions, 7 deletions
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 9a0e0964..61efa8da 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1(
int arch /* I Run-time architecture */
);
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+ ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#else
+
extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 *res_nrg, /* O Residual energy */
opus_int *res_nrg_Q, /* O Residual energy Q value */
@@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
# define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#endif
+
opus_int64 silk_inner_prod16_aligned_64_sse4_1(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
const opus_int len
);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+
+#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
+ ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+
+#else
+
extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
@@ -75,3 +91,4 @@ extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
#endif
#endif
+#endif
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index f970632c..afd5ec26 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1(
opus_int L /* I number of vectors in codebook */
);
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+ mu_Q9, max_gain_Q7, L, arch) \
+ ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
+ mu_Q9, max_gain_Q7, L))
+
+#else
+
extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
opus_int8 *ind, /* O index of best codebook vector */
opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
@@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
mu_Q9, max_gain_Q7, L))
+#endif
+
# define OVERRIDE_silk_NSQ
void silk_NSQ_sse4_1(
@@ -89,6 +100,15 @@ void silk_NSQ_sse4_1(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+ ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#else
+
extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
@@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+#endif
+
# define OVERRIDE_silk_NSQ_del_dec
void silk_NSQ_del_dec_sse4_1(
@@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
+#if defined OPUS_X86_PRESUME_SSE4_1
+
+#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+ ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+
+#else
+
extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I/O Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
@@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
+#endif
+
void silk_noise_shape_quantizer(
silk_nsq_state *NSQ, /* I/O NSQ state */
opus_int signalType, /* I Signal type */
@@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1(
const opus_int16 pIn[]
);
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+
+#else
+
# define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
@@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
# define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
+#endif
+
void silk_warped_LPC_analysis_filter_FIX_sse4_1(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
@@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1(
const opus_int order /* I Filter order (even) */
);
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
+ ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
+
+#else
+
extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
@@ -224,5 +270,7 @@ extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])
# define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \
((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order))
+#endif
+
# endif
#endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 6747d101..ad9fef2a 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -35,6 +35,10 @@
#include "pitch.h"
#include "main.h"
+#if !defined(OPUS_X86_PRESUME_SSE4_1)
+
+#if defined(FIXED_POINT)
+
opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
const opus_int16 *inVec1,
const opus_int16 *inVec2,
@@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_inner_prod16_aligned_64_c, /* non-sse */
silk_inner_prod16_aligned_64_c,
+ silk_inner_prod16_aligned_64_c,
MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
- NULL
};
+#endif
+
opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )(
silk_encoder_state *psEncC,
const opus_int16 pIn[]
) = {
silk_VAD_GetSA_Q8_c, /* non-sse */
silk_VAD_GetSA_Q8_c,
+ silk_VAD_GetSA_Q8_c,
MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */
- NULL
};
void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_NSQ_c, /* non-sse */
silk_NSQ_c,
+ silk_NSQ_c,
MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
- NULL
};
void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_VQ_WMat_EC_c, /* non-sse */
silk_VQ_WMat_EC_c,
+ silk_VQ_WMat_EC_c,
MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
- NULL
};
void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_NSQ_del_dec_c, /* non-sse */
silk_NSQ_del_dec_c,
+ silk_NSQ_del_dec_c,
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
- NULL
};
+#if defined(FIXED_POINT)
+
void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(
opus_int32 state[], /* I/O State [order + 1] */
opus_int32 res_Q2[], /* O Residual signal [length] */
@@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */
silk_warped_LPC_analysis_filter_FIX_c,
+ silk_warped_LPC_analysis_filter_FIX_c,
MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */
- NULL
};
void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
@@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )(
) = {
silk_burg_modified_c, /* non-sse */
silk_burg_modified_c,
+ silk_burg_modified_c,
MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */
- NULL
};
+
+#endif
+#endif