diff options
author | hayati ayguen <h_ayguen@web.de> | 2020-11-11 23:09:58 +0100 |
---|---|---|
committer | hayati ayguen <h_ayguen@web.de> | 2020-11-11 23:09:58 +0100 |
commit | 29eb8477d4d7cd274b5432737725aadcd3df2767 (patch) | |
tree | c28a0903ffb5c2aad13c78a9cf2f1d0c0b416dfb | |
parent | ee17cb0c97146a888c8d925999413f20934676fb (diff) | |
download | pffft-29eb8477d4d7cd274b5432737725aadcd3df2767.tar.gz |
added initial PFDSP library with mixer, carrier (generation) and cic functions
portions copied from csdr library
Signed-off-by: hayati ayguen <h_ayguen@web.de>
-rw-r--r-- | CMakeLists.txt | 30 | ||||
-rw-r--r-- | bench_mixers.c | 517 | ||||
-rw-r--r-- | fmv.h | 20 | ||||
-rw-r--r-- | pf_carrier.cpp | 298 | ||||
-rw-r--r-- | pf_carrier.h | 75 | ||||
-rw-r--r-- | pf_cic.cpp | 252 | ||||
-rw-r--r-- | pf_cic.h | 58 | ||||
-rw-r--r-- | pf_mixer.cpp | 750 | ||||
-rw-r--r-- | pf_mixer.h | 177 | ||||
-rw-r--r--[-rwxr-xr-x] | use_gcc8.inc | 0 |
10 files changed, 2177 insertions, 0 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index b33e6a7..ec92e24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,25 @@ set_property(TARGET PFFFT APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ###################################################### +if (USE_TYPE_FLOAT) + + add_library(PFDSP STATIC pf_mixer.cpp pf_mixer.h pf_carrier.cpp pf_carrier.h pf_cic.cpp pf_cic.h fmv.h ) + target_compile_definitions(PFDSP PRIVATE _USE_MATH_DEFINES) + if (USE_DEBUG_ASAN) + target_compile_options(PFDSP PRIVATE "-fsanitize=address") + endif() + if (USE_SIMD AND USE_SIMD_NEON) + target_compile_definitions(PFDSP PRIVATE PFFFT_ENABLE_NEON=1) + target_compile_options(PFDSP PRIVATE "-mfpu=neon") + endif() + target_link_libraries( PFDSP ${MATHLIB} ) + set_property(TARGET PFDSP APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> + ) +endif() + +###################################################### + add_library(FFTPACK STATIC fftpack.c fftpack.h) target_compile_definitions(FFTPACK PRIVATE _USE_MATH_DEFINES) target_link_libraries( FFTPACK ${MATHLIB} ) @@ -282,6 +301,17 @@ endif() ###################################################### +if (USE_TYPE_FLOAT) + add_executable(bench_pf_mixer_float bench_mixers.c) + target_compile_definitions(bench_pf_mixer_float PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(bench_pf_mixer_float PRIVATE PFFFT_ENABLE_FLOAT) + + target_link_libraries( bench_pf_mixer_float PFDSP ${ASANLIB} ) + +endif() + +###################################################### + enable_testing() if (USE_TYPE_FLOAT) diff --git a/bench_mixers.c b/bench_mixers.c new file mode 100644 index 0000000..b659020 --- /dev/null +++ b/bench_mixers.c @@ -0,0 +1,517 @@ +/* + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + bench for mixer algorithm/implementations + + */ + +#include <pf_mixer.h> + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <assert.h> +#include <string.h> + +#define HAVE_SYS_TIMES + +#ifdef HAVE_SYS_TIMES +# include <sys/times.h> +# include <unistd.h> +#endif + +#define BENCH_REF_TRIG_FUNC 1 +#define BENCH_OUT_OF_PLACE_ALGOS 1 +#define BENCH_INPLACE_ALGOS 1 + + +#if defined(HAVE_SYS_TIMES) + static double ttclk = 0.; + + static double uclock_sec(void) + { + struct tms t; + if (ttclk == 0.) + ttclk = sysconf(_SC_CLK_TCK); + times(&t); + /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ + return ((double)t.tms_utime) / ttclk; + } +# else + double uclock_sec(void) + { return (double)clock()/(double)CLOCKS_PER_SEC; } +#endif + + +void save(complexf * d, int B, int N, const char * fn) +{ + if (!fn) + fn = "/dev/shm/bench.bin"; + FILE * f = fopen(fn, "wb"); + if (!f) { + fprintf(stderr, "error writing result to %s\n", fn); + return; + } + for (int off = 0; off + B <= N; off += B) + { + fwrite(d+off, sizeof(complexf), B, f); + } + fclose(f); +} + + +double bench_shift_math_cc(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_math_cc(input+off, output+off, B, -0.0009F, phase); + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_table_cc(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + int table_size=65536; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_table_data_t table_data = shift_table_init(table_size); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_table_cc(input+off, output+off, B, -0.0009F, table_data, phase); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_addfast(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + int table_size=65536; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_addfast_data_t state = shift_addfast_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_addfast_cc(input+off, output+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_unroll_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_unroll_data_t state = shift_unroll_init(-0.0009F, B); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_unroll_cc(input+off, output+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + +double bench_shift_unroll_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_unroll_data_t state = shift_unroll_init(-0.0009F, B); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + phase = shift_unroll_inp_c(input+off, B, &state, phase); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, NULL); + free(input); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + + +double bench_shift_limited_unroll_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_cc(input+off, output+off, B, &state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_limited_unroll_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_data_t state = shift_limited_unroll_init(-0.0009F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_inp_c(input+off, B, &state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, NULL); + free(input); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_limited_unroll_sse_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + shift_limited_unroll_sse_data_t *state = malloc(sizeof(shift_limited_unroll_sse_data_t)); + + *state = shift_limited_unroll_sse_init(-0.0009F, 0.0F); + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_limited_unroll_sse_inp_c(input+off, B, state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, NULL); + free(input); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_rec_osc_cc_oop(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + complexf *output = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state, shift_state; + shift_recursive_osc_conf_t gen_conf, shift_conf; + + shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state); + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_recursive_osc_cc(input+off, output+off, B, &shift_conf, &shift_state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + //save(input, B, off, "/dev/shm/bench_inp.bin"); + save(output, B, off, NULL); + free(input); + free(output); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_rec_osc_cc_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state, shift_state; + shift_recursive_osc_conf_t gen_conf, shift_conf; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + shift_recursive_osc_init(-0.0009F, 0.0F, &shift_conf, &shift_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_recursive_osc_inp_c(input+off, B, &shift_conf, &shift_state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, NULL); + free(input); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + +double bench_shift_rec_osc_sse_c_inp(int B, int N) { + double t0, t1, tstop, T, nI; + int iter, off; + float phase = 0.0F; + complexf *input = (complexf *)malloc(N * sizeof(complexf)); + shift_recursive_osc_t gen_state; + shift_recursive_osc_conf_t gen_conf; + + shift_recursive_osc_sse_t *shift_state = malloc(sizeof(shift_recursive_osc_sse_t)); + shift_recursive_osc_sse_conf_t shift_conf; + + shift_recursive_osc_init(0.001F, 0.0F, &gen_conf, &gen_state); + gen_recursive_osc_c(input, N, &gen_conf, &gen_state); + + shift_recursive_osc_sse_init(-0.0009F, 0.0F, &shift_conf, shift_state); + + iter = 0; + off = 0; + t0 = uclock_sec(); + tstop = t0 + 0.5; /* benchmark duration: 500 ms */ + do { + // work + shift_recursive_osc_sse_inp_c(input+off, B, &shift_conf, shift_state); + + off += B; + ++iter; + t1 = uclock_sec(); + } while ( t1 < tstop && off + B < N ); + + save(input, B, off, NULL); + free(input); + printf("processed %f Msamples\n", off * 1E-6); + T = ( t1 - t0 ); /* duration per fft() */ + nI = ((double)iter) * B; /* number of iterations "normalized" to O(N) = N */ + return (nI / T); /* normalized iterations per second */ +} + + + +int main(int argc, char **argv) +{ + double rt; + + // process up to 64 MSample (512 MByte) in blocks of 8 kSamples (=64 kByte) + int B = 8 * 1024; + int N = 64 * 1024 * 1024; + +#if BENCH_REF_TRIG_FUNC + printf("\nstarting bench of shift_math_cc (out-of-place) with trig functions ..\n"); + rt = bench_shift_math_cc(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); +#endif + +#if BENCH_OUT_OF_PLACE_ALGOS + printf("starting bench of shift_table_cc (out-of-place) ..\n"); + rt = bench_shift_table_cc(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_addfast_cc (out-of-place) ..\n"); + rt = bench_shift_addfast(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_unroll_cc (out-of-place) ..\n"); + rt = bench_shift_unroll_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_limited_unroll_cc (out-of-place) ..\n"); + rt = bench_shift_limited_unroll_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("\nstarting bench of shift_recursive_osc_cc (out-of-place) ..\n"); + rt = bench_shift_rec_osc_cc_oop(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); +#endif + +#if BENCH_INPLACE_ALGOS + printf("starting bench of shift_unroll_cc in-place ..\n"); + rt = bench_shift_unroll_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_cc in-place ..\n"); + rt = bench_shift_limited_unroll_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_limited_unroll_sse_inp_c in-place ..\n"); + rt = bench_shift_limited_unroll_sse_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_recursive_osc_cc in-place ..\n"); + rt = bench_shift_rec_osc_cc_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + + printf("starting bench of shift_recursive_osc_sse_c in-place ..\n"); + rt = bench_shift_rec_osc_sse_c_inp(B, N); + printf(" %f MSamples/sec\n\n", rt * 1E-6); + +#endif + + return 0; +} + @@ -0,0 +1,20 @@ +#ifndef FMV_H + +#if HAVE_FUNC_ATTRIBUTE_IFUNC +#if defined(__has_attribute) +#if __has_attribute(target_clones) +#if defined(__x86_64) + +// see https://gcc.gnu.org/wiki/FunctionMultiVersioning +#define PF_TARGET_CLONES __attribute__((target_clones("avx","sse4.2","sse3","sse2","sse","default"))) +#define HAVE_PF_TARGET_CLONES 1 +#endif +#endif +#endif +#endif + +#ifndef PF_TARGET_CLONES +#define PF_TARGET_CLONES +#endif + +#endif diff --git a/pf_carrier.cpp b/pf_carrier.cpp new file mode 100644 index 0000000..d751a55 --- /dev/null +++ b/pf_carrier.cpp @@ -0,0 +1,298 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* include own header first, to see missing includes */ +#include "pf_carrier.h" +#include "fmv.h" + +#include <limits.h> +#include <assert.h> + + +PF_TARGET_CLONES +void generate_dc_f(float* output, int size) +{ + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + } +} + +PF_TARGET_CLONES +void generate_dc_s16(short* output, int size) +{ + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + } +} + +PF_TARGET_CLONES +void generate_pos_fs4_f(float* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0.0F; + output[i++]=(127.0F / 128.0F); + /* exp(i* +pi) = -1+i*0 */ + output[i++]=(-127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0.0F; + output[i++]=(-127.0F / 128.0F); + } +} + +PF_TARGET_CLONES +void generate_pos_fs4_s16(short* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0; + output[i++]=SHRT_MAX; + /* exp(i* +pi) = -1+i*0 */ + output[i++]=-SHRT_MAX; + output[i++]=0; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0; + output[i++]=-SHRT_MAX; + } +} + +PF_TARGET_CLONES +void generate_neg_fs4_f(float* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=(127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0.0F; + output[i++]=(-127.0F / 128.0F); + /* exp(i* +pi) = -1+i*0 */ + output[i++]=(-127.0F / 128.0F); + output[i++]=0.0F; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0.0F; + output[i++]=(127.0F / 128.0F); + } +} + +PF_TARGET_CLONES +void generate_neg_fs4_s16(short* output, int size) +{ + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+i*0 */ + output[i++]=SHRT_MAX; + output[i++]=0; + /* exp(i* -pi/2) = 0+i*-1 */ + output[i++]=0; + output[i++]=-SHRT_MAX; + /* exp(i* +pi) = -1+i*0 */ + output[i++]=-SHRT_MAX; + output[i++]=0; + /* exp(i* +pi/2) = 0+i*1 */ + output[i++]=0; + output[i++]=SHRT_MAX; + } +} + +/****************************************************/ + +PF_TARGET_CLONES +void generate_dc_pos_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+1+i*0 */ + output[i++]=m+m; + output[i++]=0; + /* exp(i* +pi/2) = 1+0+i*1 */ + output[i++]=m+0; + output[i++]=m; + /* exp(i* +pi) = 1-1+i*0 */ + output[i++]=m-m; + output[i++]=0; + /* exp(i* -pi/2) = 1+0+i*-1 */ + output[i++]=m; + output[i++]=-m; + } +} + +PF_TARGET_CLONES +void generate_dc_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* exp(i*0) = 1+1+i*0 */ + output[i++]=m+m; + output[i++]=0; + /* exp(i* -pi/2) = 1+0+i*-1 */ + output[i++]=m+0; + output[i++]=-m; + /* exp(i* +pi) = 1-1+i*0 */ + output[i++]=m-m; + output[i++]=0; + /* exp(i* +pi/2) = 1+0+i*1 */ + output[i++]=m+0; + output[i++]=m; + } +} + +PF_TARGET_CLONES +void generate_pos_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* pos(0) + neg(0) = exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */ + output[i++]=m; + output[i++]=-m; + + /* pos(1) + neg(1) = exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */ + output[i++]=-m; + output[i++]=m; + + /* pos(2) + neg(2) = exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */ + output[i++]=-m; + output[i++]=m; + + /* pos(3) + neg(3) = exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */ + output[i++]=m; + output[i++]=-m; + } +} + +PF_TARGET_CLONES +void generate_dc_pos_neg_fs4_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* dc + pos(0) + neg(0) = dc + exp(i* 0 ) + exp(i* 0 ) = 1 +i* 0 + 1 +i* 0 */ + output[i++]=m+m; + output[i++]=-m; + + /* dc + pos(1) + neg(1) = dc + exp(i* +pi/2) + exp(i* -pi/2) = 0 +i* 1 + 0 +i* -1 */ + output[i++]=0; + output[i++]=m; + + /* dc + pos(2) + neg(2) = dc + exp(i* +pi ) + exp(i* +pi ) = -1 +i* 0 + -1 +i* 0 */ + output[i++]=0; + output[i++]=m; + + /* dc + pos(3) + neg(3) = dc + exp(i* -pi/2) + exp(i* +pi/2) = 0 +i* -1 + 0 +i* 1 */ + output[i++]=m+m; + output[i++]=-m; + } +} + + +PF_TARGET_CLONES +void generate_pos_neg_fs2_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* dc + exp(i* 0 ) = +1 */ + output[i++]=m; + output[i++]=0; + /* dc + exp(i* pi) = -1 */ + output[i++]=-m; + output[i++]=0; + /* dc + exp(i* 0 ) = +1 */ + output[i++]=m; + output[i++]=0; + /* dc + exp(i* pi) = -1 */ + output[i++]=-m; + output[i++]=0; + } +} + +PF_TARGET_CLONES +void generate_dc_pos_neg_fs2_s16(short* output, int size) +{ + const int m = SHRT_MAX / 2; + /* size must be multiple of 4 */ + assert(!(size&3)); + for(int i=0;i<2*size;) + { + /* with dc = i*1 */ + /* dc + exp(i* 0 ) = i*1 +1 */ + output[i++]=m; + output[i++]=m; + /* dc + exp(i* pi) = i*1 -1 */ + output[i++]=-m; + output[i++]=m; + /* dc + exp(i* 0 ) = i*1 +1 */ + output[i++]=m; + output[i++]=m; + /* dc + exp(i* pi) = i*1 -1 */ + output[i++]=-m; + output[i++]=m; + } +} + + diff --git a/pf_carrier.h b/pf_carrier.h new file mode 100644 index 0000000..c328ce0 --- /dev/null +++ b/pf_carrier.h @@ -0,0 +1,75 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include <stdio.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + _____ _ + / ____| | | + | | ___ _ __ ___ _ __ | | _____ __ + | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ / + | |___| (_) | | | | | | |_) | | __/> < + \_____\___/|_| |_| |_| .__/|_|\___/_/\_\ + | | + |_| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + + +/* generation functions */ +void generate_dc_f(float* output, int size); +void generate_dc_s16(short* output, int size); +void generate_pos_fs4_f(float* output, int size); +void generate_pos_fs4_s16(short* output, int size); +void generate_neg_fs4_f(float* output, int size); +void generate_neg_fs4_s16(short* output, int size); + +void generate_dc_pos_fs4_s16(short* output, int size); +void generate_dc_neg_fs4_s16(short* output, int size); +void generate_pos_neg_fs4_s16(short* output, int size); +void generate_dc_pos_neg_fs4_s16(short* output, int size); + +void generate_pos_neg_fs2_s16(short* output, int size); +void generate_dc_pos_neg_fs2_s16(short* output, int size); + + +#ifdef __cplusplus +} +#endif + diff --git a/pf_cic.cpp b/pf_cic.cpp new file mode 100644 index 0000000..34b90c5 --- /dev/null +++ b/pf_cic.cpp @@ -0,0 +1,252 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* include own header first, to see missing includes */ +#include "pf_cic.h" +#include "fmv.h" + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> + + +/* + ____ ___ ____ ____ ____ ____ + / ___|_ _/ ___| | _ \| _ \ / ___| + | | | | | | | | | | | | | + | |___ | | |___ | |_| | |_| | |___ + \____|___\____| |____/|____/ \____| +*/ + +#define SINESHIFT 12 +#define SINESIZE (1<<SINESHIFT) +typedef int64_t cic_dt; // data type used for integrators and combs +typedef struct { + int factor; + uint64_t phase; + float gain; + cic_dt ig0a, ig0b, ig1a, ig1b; + cic_dt comb0a, comb0b, comb1a, comb1b; + int16_t *sinetable; +} cicddc_t; + +void *cicddc_init(int factor) { + int i; + int sinesize2 = SINESIZE * 5/4; // 25% extra to get cosine from the same table + cicddc_t *s; + s = (cicddc_t *)malloc(sizeof(cicddc_t)); + memset(s, 0, sizeof(cicddc_t)); + + float sineamp = 32767.0f; + s->factor = factor; + s->gain = 1.0f / SHRT_MAX / sineamp / factor / factor / factor; // compensate for gain of 3 integrators + + s->sinetable = (int16_t *)malloc(sinesize2 * sizeof(*s->sinetable)); + double f = 2.0*M_PI / (double)SINESIZE; + for(i = 0; i < sinesize2; i++) { + s->sinetable[i] = sineamp * cos(f * i); + } + return s; +} + +void cicddc_free(void *state) { + cicddc_t *s = (cicddc_t *)state; + free(s->sinetable); + free(s); +} + + +PF_TARGET_CLONES +void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + int16_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int sinep = phase >> (64-SINESHIFT); + in_a = (int32_t)inp[i] * (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + in_b = (int32_t)inp[i] * (int32_t)sinetable[sinep]; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". */ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + +PF_TARGET_CLONES +void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + int16_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int32_t m_a, m_b, m_c, m_d; + int sinep = phase >> (64-SINESHIFT); + m_a = inp[2*i]; + m_b = inp[2*i+1]; + m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + m_d = (int32_t)sinetable[sinep]; + // complex multiplication: + in_a = m_a*m_c - m_b*m_d; + in_b = m_a*m_d + m_b*m_c; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". */ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += 2*factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + + +/* This is almost copy paste from cicddc_cs16_c. + I'm afraid this is going to be annoying to maintain... */ +PF_TARGET_CLONES +void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate) { + cicddc_t *s = (cicddc_t *)state; + int k; + int factor = s->factor; + cic_dt ig0a = s->ig0a, ig0b = s->ig0b, ig1a = s->ig1a, ig1b = s->ig1b; + cic_dt comb0a = s->comb0a, comb0b = s->comb0b, comb1a = s->comb1a, comb1b = s->comb1b; + uint64_t phase = s->phase, freq; + int16_t *sinetable = s->sinetable; + float gain = s->gain; + + freq = rate * ((float)(1ULL << 63) * 2); + + uint8_t *inp = input; + for(k = 0; k < outsize; k++) { + int i; + cic_dt out0a, out0b, out1a, out1b; + cic_dt ig2a = 0, ig2b = 0; // last integrator and first comb replaced simply by sum + for(i = 0; i < factor; i++) { + cic_dt in_a, in_b; + int32_t m_a, m_b, m_c, m_d; + int sinep = phase >> (64-SINESHIFT); + // subtract 127.4 (good for rtl-sdr) + m_a = (((int32_t)inp[2*i]) << 8) - 32614; + m_b = (((int32_t)inp[2*i+1]) << 8) - 32614; + m_c = (int32_t)sinetable[sinep + (1<<(SINESHIFT-2))]; + m_d = (int32_t)sinetable[sinep]; + // complex multiplication: + in_a = m_a*m_c - m_b*m_d; + in_b = m_a*m_d + m_b*m_c; + phase += freq; + /* integrators: + The calculations are ordered so that each integrator + takes a result from previous loop iteration + to make the code more "pipeline-friendly". */ + ig2a += ig1a; ig2b += ig1b; + ig1a += ig0a; ig1b += ig0b; + ig0a += in_a; ig0b += in_b; + } + inp += 2*factor; + // comb filters: + out0a = ig2a - comb0a; out0b = ig2b - comb0b; + comb0a = ig2a; comb0b = ig2b; + out1a = out0a - comb1a; out1b = out0b - comb1b; + comb1a = out0a; comb1b = out0b; + + output[k].i = (float)out1a * gain; + output[k].q = (float)out1b * gain; + } + + s->ig0a = ig0a; s->ig0b = ig0b; + s->ig1a = ig1a; s->ig1b = ig1b; + s->comb0a = comb0a; s->comb0b = comb0b; + s->comb1a = comb1a; s->comb1b = comb1b; + s->phase = phase; +} + diff --git a/pf_cic.h b/pf_cic.h new file mode 100644 index 0000000..681ee4f --- /dev/null +++ b/pf_cic.h @@ -0,0 +1,58 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + ____ ___ ____ ____ ____ ____ + / ___|_ _/ ___| | _ \| _ \ / ___| + | | | | | | | | | | | | | + | |___ | | |___ | |_| | |_| | |___ + \____|___\____| |____/|____/ \____| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + +void *cicddc_init(int factor); +void cicddc_free(void *state); +void cicddc_s16_c(void *state, int16_t *input, complexf *output, int outsize, float rate); +void cicddc_cs16_c(void *state, int16_t *input, complexf *output, int outsize, float rate); +void cicddc_cu8_c(void *state, uint8_t *input, complexf *output, int outsize, float rate); + +#ifdef __cplusplus +} +#endif + diff --git a/pf_mixer.cpp b/pf_mixer.cpp new file mode 100644 index 0000000..570ffaf --- /dev/null +++ b/pf_mixer.cpp @@ -0,0 +1,750 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* include own header first, to see missing includes */ +#include "pf_mixer.h" +#include "fmv.h" + +//#include <stdio.h> +//#include <time.h> +#include <math.h> +#include <stdlib.h> +//#include <string.h> +//#include <unistd.h> +//#include <limits.h> +//#include <assert.h> +//#include <stdarg.h> + +//they dropped M_PI in C99, so we define it: +#define PI ((float)3.14159265358979323846) + +//apply to pointers: +#define iof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i))) +#define qof(complexf_input_p,i) (*(((float*)complexf_input_p)+2*(i)+1)) + + +/* + _____ _____ _____ __ _ _ + | __ \ / ____| __ \ / _| | | (_) + | | | | (___ | |__) | | |_ _ _ _ __ ___| |_ _ ___ _ __ ___ + | | | |\___ \| ___/ | _| | | | '_ \ / __| __| |/ _ \| '_ \/ __| + | |__| |____) | | | | | |_| | | | | (__| |_| | (_) | | | \__ \ + |_____/|_____/|_| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/ + +*/ + +PF_TARGET_CLONES +float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase) +{ + rate*=2; + //Shifts the complex spectrum. Basically a complex mixer. This version uses cmath. + float phase=starting_phase; + float phase_increment=rate*PI; + float cosval, sinval; + for(int i=0;i<input_size; i++) + { + cosval=cos(phase); + sinval=sin(phase); + //we multiply two complex numbers. + //how? enter this to maxima (software) for explanation: + // (a+b*%i)*(c+d*%i), rectform; + iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i); + qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i); + phase+=phase_increment; + while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase + while(phase<0) phase+=2*PI; + } + return phase; +} + + + +shift_table_data_t shift_table_init(int table_size) +{ + //RTODO + shift_table_data_t output; + output.table=(float*)malloc(sizeof(float)*table_size); + output.table_size=table_size; + for(int i=0;i<table_size;i++) + { + output.table[i]=sin(((float)i/table_size)*(PI/2)); + } + return output; +} + +void shift_table_deinit(shift_table_data_t table_data) +{ + free(table_data.table); +} + + +PF_TARGET_CLONES +float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase) +{ + //RTODO + rate*=2; + //Shifts the complex spectrum. Basically a complex mixer. This version uses a pre-built sine table. + float phase=starting_phase; + float phase_increment=rate*PI; + float cosval, sinval; + for(int i=0;i<input_size; i++) //@shift_math_cc + { + int sin_index, cos_index, temp_index, sin_sign, cos_sign; + //float vphase=fmodf(phase,PI/2); //between 0 and 90deg + int quadrant=phase/(PI/2); //between 0 and 3 + float vphase=phase-quadrant*(PI/2); + sin_index=(vphase/(PI/2))*table_data.table_size; + cos_index=table_data.table_size-1-sin_index; + if(quadrant&1) //in quadrant 1 and 3 + { + temp_index=sin_index; + sin_index=cos_index; + cos_index=temp_index; + } + sin_sign=(quadrant>1)?-1:1; //in quadrant 2 and 3 + cos_sign=(quadrant&&quadrant<3)?-1:1; //in quadrant 1 and 2 + sinval=sin_sign*table_data.table[sin_index]; + cosval=cos_sign*table_data.table[cos_index]; + //we multiply two complex numbers. + //how? enter this to maxima (software) for explanation: + // (a+b*%i)*(c+d*%i), rectform; + iof(output,i)=cosval*iof(input,i)-sinval*qof(input,i); + qof(output,i)=sinval*iof(input,i)+cosval*qof(input,i); + phase+=phase_increment; + while(phase>2*PI) phase-=2*PI; //@shift_math_cc: normalize phase + while(phase<0) phase+=2*PI; + } + return phase; +} + + +shift_unroll_data_t shift_unroll_init(float rate, int size) +{ + shift_unroll_data_t output; + output.phase_increment=2*rate*PI; + output.size = size; + output.dsin=(float*)malloc(sizeof(float)*size); + output.dcos=(float*)malloc(sizeof(float)*size); + float myphase = 0; + for(int i=0;i<size;i++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + output.dsin[i]=sin(myphase); + output.dcos[i]=cos(myphase); + } + return output; +} + +void shift_unroll_deinit(shift_unroll_data_t* d) +{ + if (d && d->dsin) + { + free(d->dsin); + d->dsin = NULL; + } + if (d && d->dcos) + { + free(d->dcos); + d->dcos = NULL; + } +} + +PF_TARGET_CLONES +float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start = cos(starting_phase); + float sin_start = sin(starting_phase); + register float cos_val, sin_val; + for(int i=0;i<input_size; i++) //@shift_unroll_cc + { + iof(output,i) = cos_val*iof(input,i) - sin_val*qof(input,i); + qof(output,i) = sin_val*iof(input,i) + cos_val*qof(input,i); + // calculate complex phasor for next iteration + cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i]; + sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; + } + starting_phase+=input_size*d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +PF_TARGET_CLONES +float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase) +{ + float cos_start = cos(starting_phase); + float sin_start = sin(starting_phase); + register float cos_val, sin_val; + for(int i=0;i<size; i++) //@shift_unroll_inp_c + { + register float inp_i = iof(in_out,i); + register float inp_q = qof(in_out,i); + iof(in_out,i) = cos_val*inp_i - sin_val*inp_q; + qof(in_out,i) = sin_val*inp_i + cos_val*inp_q; + // calculate complex phasor for next iteration + cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i]; + sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i]; + } + starting_phase += size * d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + + +#define SHIFT_UNROLL_SIZE CSDR_SHIFT_LIMITED_UNROLL_SIZE +#define SHIFT_LIMITED_SIMD_SZ CSDR_SHIFT_LIMITED_SIMD_SZ + +shift_limited_unroll_data_t shift_limited_unroll_init(float rate) +{ + shift_limited_unroll_data_t output; + output.phase_increment=2*rate*PI; + float myphase = 0; + for(int i=0;i<SHIFT_UNROLL_SIZE;i++) + { + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + output.dcos[i] = cos(myphase); + output.dsin[i] = sin(myphase); + } + output.complex_phase.i = 1.0F; + output.complex_phase.q = 0.0F; + return output; +} + +PF_TARGET_CLONES +void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d) +{ + float cos_start = d->complex_phase.i; + float sin_start = d->complex_phase.q; + register float cos_val = cos_start, sin_val = sin_start; + while (size > 0) //@shift_limited_unroll_cc + { + int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size; + for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ ) + { + for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + { + iof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) - sin_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j); + qof(output,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*iof(input,SHIFT_LIMITED_SIMD_SZ*i+j) + cos_val*qof(input,SHIFT_LIMITED_SIMD_SZ*i+j); + // calculate complex phasor for next iteration + cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + } + } + input += SHIFT_UNROLL_SIZE; + output += SHIFT_UNROLL_SIZE; + size -= SHIFT_UNROLL_SIZE; + } + d->complex_phase.i = cos_val; + d->complex_phase.q = sin_val; +} + +PF_TARGET_CLONES +void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d) +{ + float inp_i[SHIFT_LIMITED_SIMD_SZ]; + float inp_q[SHIFT_LIMITED_SIMD_SZ]; + float cos_start = d->complex_phase.i; + float sin_start = d->complex_phase.q; + register float cos_val = cos_start, sin_val = sin_start; + while (size > 0) //@shift_limited_unroll_inp_c + { + int N = (size >= SHIFT_UNROLL_SIZE) ? SHIFT_UNROLL_SIZE : size; + for(int i=0;i<N/SHIFT_LIMITED_SIMD_SZ; i++ ) + { + for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + inp_i[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].i; + for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + inp_q[j] = in_out[SHIFT_LIMITED_SIMD_SZ*i+j].q; + for(int j=0; j<SHIFT_LIMITED_SIMD_SZ; j++) + { + iof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = cos_val*inp_i[j] - sin_val*inp_q[j]; + qof(in_out,SHIFT_LIMITED_SIMD_SZ*i+j) = sin_val*inp_i[j] + cos_val*inp_q[j]; + // calculate complex phasor for next iteration + cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + } + } + in_out += SHIFT_UNROLL_SIZE; + size -= SHIFT_UNROLL_SIZE; + } + d->complex_phase.i = cos_val; + d->complex_phase.q = sin_val; +} + + +#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)) + +#include <xmmintrin.h> +typedef __m128 v4sf; + +#if defined(__GNUC__) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define RESTRICT __restrict +#elif defined(_MSC_VER) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define RESTRICT __restrict +#endif + +# define SIMD_SZ 4 + +typedef union v4_union { + __m128 v; + float f[4]; +} v4_union; + +#define VMUL(a,b) _mm_mul_ps(a,b) +#define VADD(a,b) _mm_add_ps(a,b) +#define VSUB(a,b) _mm_sub_ps(a,b) +#define LD_PS1(s) _mm_set1_ps(s) +#define VLOAD_UNALIGNED(ptr) _mm_loadu_ps((const float *)(ptr)) +#define VLOAD_ALIGNED(ptr) _mm_load_ps((const float *)(ptr)) +#define VSTORE_UNALIGNED(ptr, v) _mm_storeu_ps((float*)(ptr), v) +#define VSTORE_ALIGNED(ptr, v) _mm_store_ps((float*)(ptr), v) +#define INTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +#define UNINTERLEAVE2(in1, in2, out1, out2) { __m128 tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } + +/* num_cplx must be multiple of 4 ! */ +void aligned_vec_complex_mul(int num_cplx, float *dest_a_vec, const float *b_vec) +{ + __m128 inp_re, inp_im; + __m128 inout_re, inout_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT u = (__m128*)dest_a_vec; + const __m128 * RESTRICT v = (const __m128*)b_vec; + const int L = num_cplx / 4; + int k; + for (k=0; k < L; ++k) + { + UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inout_re, inout_im); /* inout_re = all reals; inout_im = all imags */ + UNINTERLEAVE2(VLOAD_ALIGNED(v), VLOAD_ALIGNED(v+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inout_re, inp_re), VMUL(inout_im, inp_im) ); + product_im = VADD( VMUL(inout_im, inp_re), VMUL(inout_re, inp_im) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE_ALIGNED(u, interl_prod_a); + VSTORE_ALIGNED(u+1, interl_prod_b); + u += 2; + v += 2; + } +} + +shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad) +{ + shift_limited_unroll_sse_data_t output; + float myphase; + + output.phase_increment = 2*relative_freq*PI; + + myphase = 0.0F; + for (int i = 0; i < SHIFT_UNROLL_SIZE + SHIFT_LIMITED_SIMD_SZ; i++) + { + output.dcos[i] = cos(myphase); + output.dsin[i] = sin(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + + output.dcos_blk = 0.0F; + output.dsin_blk = 0.0F; + + myphase = phase_start_rad; + for (int i = 0; i < SHIFT_LIMITED_SIMD_SZ; i++) + { + output.phase_state_i[i] = cos(myphase); + output.phase_state_q[i] = sin(myphase); + myphase += output.phase_increment; + while(myphase>PI) myphase-=2*PI; + while(myphase<-PI) myphase+=2*PI; + } + return output; +} + + + +void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d) +{ + const __m128 cos_starts = VLOAD_ALIGNED( &d->phase_state_i[0] ); + const __m128 sin_starts = VLOAD_ALIGNED( &d->phase_state_q[0] ); + __m128 cos_vals = cos_starts; + __m128 sin_vals = sin_starts; + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT p_trig_cos_tab; + __m128 * RESTRICT p_trig_sin_tab; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + const int NB = (N_cplx >= CSDR_SHIFT_LIMITED_UNROLL_SIZE) ? CSDR_SHIFT_LIMITED_UNROLL_SIZE : N_cplx; + int B = NB; + p_trig_cos_tab = (__m128*)( &d->dcos[0] ); + p_trig_sin_tab = (__m128*)( &d->dsin[0] ); + while (B >= 0) + { + // complex multiplication of 4 complex values from/to in_out[] + // == u[0..3] *= (cos_val[0..3] + i * sin_val[0..3]): + UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + product_re = VSUB( VMUL(inp_re, cos_vals), VMUL(inp_im, sin_vals) ); + product_im = VADD( VMUL(inp_im, cos_vals), VMUL(inp_re, sin_vals) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE_ALIGNED(u, interl_prod_a); + VSTORE_ALIGNED(u+1, interl_prod_b); + u += 2; + // calculate complex phasor for next iteration + // cos_val = cos_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] - sin_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + // sin_val = sin_start * d->dcos[SHIFT_LIMITED_SIMD_SZ*i+j] + cos_start * d->dsin[SHIFT_LIMITED_SIMD_SZ*i+j]; + // cos_val[]/sin_val[] .. can't fade towards 0 inside this while loop :-) + inp_re = VLOAD_ALIGNED(p_trig_cos_tab); + inp_im = VLOAD_ALIGNED(p_trig_sin_tab); + cos_vals = VSUB( VMUL(inp_re, cos_starts), VMUL(inp_im, sin_starts) ); + sin_vals = VADD( VMUL(inp_im, cos_starts), VMUL(inp_re, sin_starts) ); + ++p_trig_cos_tab; + ++p_trig_sin_tab; + B -= 4; + } + N_cplx -= NB; + /* normalize d->phase_state_i[]/d->phase_state_q[], that magnitude does not fade towards 0 ! */ + /* re-use product_re[]/product_im[] for normalization */ + product_re = VADD( VMUL(cos_vals, cos_vals), VMUL(sin_vals, sin_vals) ); + product_im = _mm_rsqrt_ps(product_re); + cos_vals = VMUL(cos_vals, product_im); + sin_vals = VMUL(sin_vals, product_im); + } + VSTORE_ALIGNED( &d->phase_state_i[0], cos_vals ); + VSTORE_ALIGNED( &d->phase_state_q[0], sin_vals ); +} + +#endif + + +shift_addfast_data_t shift_addfast_init(float rate) +{ + shift_addfast_data_t output; + output.phase_increment=2*rate*PI; + for(int i=0;i<4;i++) + { + output.dsin[i]=sin(output.phase_increment*(i+1)); + output.dcos[i]=cos(output.phase_increment*(i+1)); + } + return output; +} + +#if defined NEON_OPTS && defined__arm__ +#pragma message "Manual NEON (arm32) optimizations are ON: we have a faster shift_addfast_cc now." + +// #define HAVE_ADDFAST_CC_IMPL +// removed cpu specific implementation + +#elif defined NEON_OPTS && defined __aarch64__ +#pragma message "Manual NEON (aarch64) optimizations are ON: we have a faster shift_addfast_cc now." + +// #define HAVE_ADDFAST_CC_IMPL +// removed cpu specific implementation + +#endif + +#ifndef HAVE_ADDFAST_CC_IMPL + +#define SADF_L1(j) cos_vals_ ## j = cos_start * dcos_ ## j - sin_start * dsin_ ## j; \ + sin_vals_ ## j = sin_start * dcos_ ## j + cos_start * dsin_ ## j; +#define SADF_L2(j) iof(output,4*i+j)=(cos_vals_ ## j)*iof(input,4*i+j)-(sin_vals_ ## j)*qof(input,4*i+j); \ + qof(output,4*i+j)=(sin_vals_ ## j)*iof(input,4*i+j)+(cos_vals_ ## j)*qof(input,4*i+j); + +PF_TARGET_CLONES +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase) +{ + //input_size should be multiple of 4 + //fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size); + float cos_start=cos(starting_phase); + float sin_start=sin(starting_phase); + float register cos_vals_0, cos_vals_1, cos_vals_2, cos_vals_3, + sin_vals_0, sin_vals_1, sin_vals_2, sin_vals_3, + dsin_0 = d->dsin[0], dsin_1 = d->dsin[1], dsin_2 = d->dsin[2], dsin_3 = d->dsin[3], + dcos_0 = d->dcos[0], dcos_1 = d->dcos[1], dcos_2 = d->dcos[2], dcos_3 = d->dcos[3]; + + for(int i=0;i<input_size/4; i++) //@shift_addfast_cc + { + SADF_L1(0) + SADF_L1(1) + SADF_L1(2) + SADF_L1(3) + SADF_L2(0) + SADF_L2(1) + SADF_L2(2) + SADF_L2(3) + cos_start = cos_vals_3; + sin_start = sin_vals_3; + } + starting_phase+=input_size*d->phase_increment; + while(starting_phase>PI) starting_phase-=2*PI; + while(starting_phase<-PI) starting_phase+=2*PI; + return starting_phase; +} + +#endif + + + +#define SHIFT_REC_SIMD_SZ CSDR_SHIFT_RECURSIVE_SIMD_SZ + +void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state) +{ + // constants for single phase step + float phase_increment_s = rate*PI; + float k1 = tan(0.5*phase_increment_s); + float k2 = 2*k1 /(1 + k1 * k1); + for (int j=1; j<SHIFT_REC_SIMD_SZ; j++) + { + float tmp; + state->u_cos[j] = state->u_cos[j-1]; + state->v_sin[j] = state->v_sin[j-1]; + // small steps + tmp = state->u_cos[j] - k1 * state->v_sin[j]; + state->v_sin[j] += k2 * tmp; + state->u_cos[j] = tmp - k1 * state->v_sin[j]; + } + + // constants for SHIFT_REC_SIMD_SZ times phase step + float phase_increment_b = phase_increment_s * SHIFT_REC_SIMD_SZ; + while(phase_increment_b > PI) phase_increment_b-=2*PI; + while(phase_increment_b < -PI) phase_increment_b+=2*PI; + conf->k1 = tan(0.5*phase_increment_b); + conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); +} + +void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state) +{ + if (starting_phase != 0.0F) + { + state->u_cos[0] = cos(starting_phase); + state->v_sin[0] = sin(starting_phase); + } + else + { + state->u_cos[0] = 1.0F; + state->v_sin[0] = 0.0F; + } + shift_recursive_osc_update_rate(rate, conf, state); +} + + +PF_TARGET_CLONES +void shift_recursive_osc_cc(const complexf *input, complexf* output, + int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) +{ + float tmp[SHIFT_REC_SIMD_SZ]; + float inp_i[SHIFT_REC_SIMD_SZ]; + float inp_q[SHIFT_REC_SIMD_SZ]; + shift_recursive_osc_t state = *state_ext; + const float k1 = conf->k1; + const float k2 = conf->k2; + for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_cc + { + //we multiply two complex numbers - similar to shift_math_cc + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + { + inp_i[j] = input[SHIFT_REC_SIMD_SZ*i+j].i; + inp_q[j] = input[SHIFT_REC_SIMD_SZ*i+j].q; + } + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + { + iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + } + // update complex phasor - like incrementing phase + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.v_sin[j] += k2 * tmp[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; + } + *state_ext = state; +} + +PF_TARGET_CLONES +void shift_recursive_osc_inp_c(complexf* in_out, + int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) +{ + float tmp[SHIFT_REC_SIMD_SZ]; + float inp_i[SHIFT_REC_SIMD_SZ]; + float inp_q[SHIFT_REC_SIMD_SZ]; + shift_recursive_osc_t state = *state_ext; + const float k1 = conf->k1; + const float k2 = conf->k2; + for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@shift_recursive_osc_inp_c + { + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + { + inp_i[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].i; + inp_q[j] = in_out[SHIFT_REC_SIMD_SZ*i+j].q; + } + //we multiply two complex numbers - similar to shift_math_cc + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + { + iof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + qof(in_out,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + } + // update complex phasor - like incrementing phase + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.v_sin[j] += k2 * tmp[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; + } + *state_ext = state; +} + +PF_TARGET_CLONES +void gen_recursive_osc_c(complexf* output, + int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state_ext) +{ + float tmp[SHIFT_REC_SIMD_SZ]; + shift_recursive_osc_t state = *state_ext; + const float k1 = conf->k1; + const float k2 = conf->k2; + for(int i=0;i<size/SHIFT_REC_SIMD_SZ; i++) //@gen_recursive_osc_c + { + // output complex oscillator value + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + { + iof(output,SHIFT_REC_SIMD_SZ*i+j) = state.u_cos[j]; + qof(output,SHIFT_REC_SIMD_SZ*i+j) = state.v_sin[j]; + } + // update complex phasor - like incrementing phase + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.v_sin[j] += k2 * tmp[j]; + for (int j=0;j<SHIFT_REC_SIMD_SZ;j++) + state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; + } + *state_ext = state; +} + + +#if (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86)) + +void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state) +{ + // constants for single phase step + float phase_increment_s = rate*PI; + float k1 = tan(0.5*phase_increment_s); + float k2 = 2*k1 /(1 + k1 * k1); + for (int j=1; j<CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; j++) + { + float tmp; + state->u_cos[j] = state->u_cos[j-1]; + state->v_sin[j] = state->v_sin[j-1]; + // small steps + tmp = state->u_cos[j] - k1 * state->v_sin[j]; + state->v_sin[j] += k2 * tmp; + state->u_cos[j] = tmp - k1 * state->v_sin[j]; + } + + // constants for CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ times phase step + float phase_increment_b = phase_increment_s * CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ; + while(phase_increment_b > PI) phase_increment_b-=2*PI; + while(phase_increment_b < -PI) phase_increment_b+=2*PI; + conf->k1 = tan(0.5*phase_increment_b); + conf->k2 = 2*conf->k1 / (1 + conf->k1 * conf->k1); +} + + +void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state) +{ + if (starting_phase != 0.0F) + { + state->u_cos[0] = cos(starting_phase); + state->v_sin[0] = sin(starting_phase); + } + else + { + state->u_cos[0] = 1.0F; + state->v_sin[0] = 0.0F; + } + shift_recursive_osc_sse_update_rate(rate, conf, state); +} + + +void shift_recursive_osc_sse_inp_c(complexf* in_out, + int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext) +{ + const __m128 k1 = LD_PS1( conf->k1 ); + const __m128 k2 = LD_PS1( conf->k2 ); + __m128 u_cos = VLOAD_ALIGNED( &state_ext->u_cos[0] ); + __m128 v_sin = VLOAD_ALIGNED( &state_ext->v_sin[0] ); + __m128 inp_re, inp_im; + __m128 product_re, product_im; + __m128 interl_prod_a, interl_prod_b; + __m128 * RESTRICT u = (__m128*)in_out; + + while (N_cplx) + { + //inp_i[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].i; + //inp_q[j] = in_out[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j].q; + UNINTERLEAVE2(VLOAD_ALIGNED(u), VLOAD_ALIGNED(u+1), inp_re, inp_im); /* inp_re = all reals; inp_im = all imags */ + + //we multiply two complex numbers - similar to shift_math_cc + //iof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.u_cos[j] * inp_i[j] - state.v_sin[j] * inp_q[j]; + //qof(in_out,CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ*i+j) = state.v_sin[j] * inp_i[j] + state.u_cos[j] * inp_q[j]; + product_re = VSUB( VMUL(inp_re, u_cos), VMUL(inp_im, v_sin) ); + product_im = VADD( VMUL(inp_im, u_cos), VMUL(inp_re, v_sin) ); + INTERLEAVE2( product_re, product_im, interl_prod_a, interl_prod_b); + VSTORE_ALIGNED(u, interl_prod_a); + VSTORE_ALIGNED(u+1, interl_prod_b); + u += 2; + + // update complex phasor - like incrementing phase + // tmp[j] = state.u_cos[j] - k1 * state.v_sin[j]; + product_re = VSUB( u_cos, VMUL(k1, v_sin) ); + // state.v_sin[j] += k2 * tmp[j]; + v_sin = VADD( v_sin, VMUL(k2, product_re) ); + // state.u_cos[j] = tmp[j] - k1 * state.v_sin[j]; + u_cos = VSUB( product_re, VMUL(k1, v_sin) ); + + N_cplx -= 4; + } + VSTORE_ALIGNED( &state_ext->u_cos[0], u_cos ); + VSTORE_ALIGNED( &state_ext->v_sin[0], v_sin ); +} + +#endif + diff --git a/pf_mixer.h b/pf_mixer.h new file mode 100644 index 0000000..ff647b3 --- /dev/null +++ b/pf_mixer.h @@ -0,0 +1,177 @@ +/* +This software is part of pffft/pfdsp, a set of simple DSP routines. + +Copyright (c) 2014, Andras Retzler <randras@sdr.hu> +Copyright (c) 2020 Hayati Ayguen <h_ayguen@web.de> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include <stdio.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + _____ _ + / ____| | | + | | ___ _ __ ___ _ __ | | _____ __ + | | / _ \| '_ ` _ \| '_ \| |/ _ \ \/ / + | |___| (_) | | | | | | |_) | | __/> < + \_____\___/|_| |_| |_| .__/|_|\___/_/\_\ + | | + |_| +*/ + +typedef struct complexf_s { float i; float q; } complexf; + +// ================================================================================= + +float shift_math_cc(complexf *input, complexf* output, int input_size, float rate, float starting_phase); + + +typedef struct shift_table_data_s +{ + float* table; + int table_size; +} shift_table_data_t; +void shift_table_deinit(shift_table_data_t table_data); +shift_table_data_t shift_table_init(int table_size); +float shift_table_cc(complexf* input, complexf* output, int input_size, float rate, shift_table_data_t table_data, float starting_phase); + + +typedef struct shift_addfast_data_s +{ + float dsin[4]; + float dcos[4]; + float phase_increment; +} shift_addfast_data_t; +shift_addfast_data_t shift_addfast_init(float rate); +float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase); + + +typedef struct shift_unroll_data_s +{ + float* dsin; + float* dcos; + float phase_increment; + int size; +} shift_unroll_data_t; +shift_unroll_data_t shift_unroll_init(float rate, int size); +void shift_unroll_deinit(shift_unroll_data_t* d); +float shift_unroll_cc(complexf *input, complexf* output, int size, shift_unroll_data_t* d, float starting_phase); +float shift_unroll_inp_c(complexf* in_out, int size, shift_unroll_data_t* d, float starting_phase); + + +/* similar to shift_unroll_cc() - but, have fixed and limited precalc size + * idea: smaller cache usage by table + * size must be multiple of CSDR_SHIFT_LIMITED_SIMD (= 4) + */ +#define CSDR_SHIFT_LIMITED_UNROLL_SIZE 64 +#define CSDR_SHIFT_LIMITED_SIMD_SZ 4 +typedef struct shift_limited_unroll_data_s +{ + float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE]; + float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE]; + complexf complex_phase; + float phase_increment; +} shift_limited_unroll_data_t; +shift_limited_unroll_data_t shift_limited_unroll_init(float rate); +/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */ +/* starting_phase for next call is kept internal in state */ +void shift_limited_unroll_cc(const complexf *input, complexf* output, int size, shift_limited_unroll_data_t* d); +void shift_limited_unroll_inp_c(complexf* in_out, int size, shift_limited_unroll_data_t* d); + + +typedef struct shift_limited_unroll_sse_data_s +{ + /* small/limited trig table */ + float dcos[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ]; + float dsin[CSDR_SHIFT_LIMITED_UNROLL_SIZE+CSDR_SHIFT_LIMITED_SIMD_SZ]; + /* 4 times complex phase */ + float phase_state_i[CSDR_SHIFT_LIMITED_SIMD_SZ]; + float phase_state_q[CSDR_SHIFT_LIMITED_SIMD_SZ]; + /* N_cplx_per_block times increment - for future parallel variants */ + float dcos_blk; + float dsin_blk; + /* */ + float phase_increment; +} shift_limited_unroll_sse_data_t; +shift_limited_unroll_sse_data_t shift_limited_unroll_sse_init(float relative_freq, float phase_start_rad); +void shift_limited_unroll_sse_inp_c(complexf* in_out, int N_cplx, shift_limited_unroll_sse_data_t* d); + + + +/* Recursive Quadrature Oscillator functions "recursive_osc" + * see https://www.vicanek.de/articles/QuadOsc.pdf + */ +#define CSDR_SHIFT_RECURSIVE_SIMD_SZ 8 +typedef struct shift_recursive_osc_s +{ + float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SZ]; + float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SZ]; +} shift_recursive_osc_t; + +typedef struct shift_recursive_osc_conf_s +{ + float k1; + float k2; +} shift_recursive_osc_conf_t; + +void shift_recursive_osc_init(float rate, float starting_phase, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t *state); +void shift_recursive_osc_update_rate(float rate, shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); + +/* size must be multiple of CSDR_SHIFT_LIMITED_SIMD_SZ */ +/* starting_phase for next call is kept internal in state */ +void shift_recursive_osc_cc(const complexf *input, complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); +void shift_recursive_osc_inp_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); +void gen_recursive_osc_c(complexf* output, int size, const shift_recursive_osc_conf_t *conf, shift_recursive_osc_t* state); + +#define CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ 4 +typedef struct shift_recursive_osc_sse_s +{ + float u_cos[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ]; + float v_sin[CSDR_SHIFT_RECURSIVE_SIMD_SSE_SZ]; +} shift_recursive_osc_sse_t; + +typedef struct shift_recursive_osc_sse_conf_s +{ + float k1; + float k2; +} shift_recursive_osc_sse_conf_t; + +void shift_recursive_osc_sse_init(float rate, float starting_phase, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t *state); +void shift_recursive_osc_sse_update_rate(float rate, shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state); +void shift_recursive_osc_sse_inp_c(complexf* in_out, int N_cplx, const shift_recursive_osc_sse_conf_t *conf, shift_recursive_osc_sse_t* state_ext); + + +#ifdef __cplusplus +} +#endif + diff --git a/use_gcc8.inc b/use_gcc8.inc index c4535f1..c4535f1 100755..100644 --- a/use_gcc8.inc +++ b/use_gcc8.inc |