init
This commit is contained in:
8
lib_xcore_math/examples/CMakeLists.txt
Normal file
8
lib_xcore_math/examples/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
|
||||
project(lib_xcore_math_examples)
|
||||
|
||||
add_subdirectory(app_bfp_demo)
|
||||
add_subdirectory(app_fft_demo)
|
||||
add_subdirectory(app_filter_demo)
|
||||
add_subdirectory(app_vect_demo)
|
||||
37
lib_xcore_math/examples/app_bfp_demo/CMakeLists.txt
Normal file
37
lib_xcore_math/examples/app_bfp_demo/CMakeLists.txt
Normal file
@@ -0,0 +1,37 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
|
||||
project(app_bfp_demo)
|
||||
|
||||
set(APP_HW_TARGET XK-EVK-XU316)
|
||||
|
||||
set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
|
||||
|
||||
set(APP_COMPILER_FLAGS -Os)
|
||||
|
||||
if(NOT BUILD_NATIVE)
|
||||
list(APPEND APP_COMPILER_FLAGS -fxscope
|
||||
-report
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
list(APPEND APP_COMPILER_FLAGS -Werror
|
||||
-g
|
||||
)
|
||||
else()
|
||||
list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
|
||||
# Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
|
||||
# See online help for details.
|
||||
-D_CRT_SECURE_NO_WARNINGS
|
||||
# Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
|
||||
/wd5045
|
||||
)
|
||||
endif()
|
||||
|
||||
if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
|
||||
list(APPEND APP_COMPILER_FLAGS -lm)
|
||||
endif()
|
||||
|
||||
XMOS_REGISTER_APP()
|
||||
247
lib_xcore_math/examples/app_bfp_demo/src/main.c
Normal file
247
lib_xcore_math/examples/app_bfp_demo/src/main.c
Normal file
@@ -0,0 +1,247 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __XS3A__
|
||||
# include <xscope.h>
|
||||
#endif
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
void bfp_s32_example();
|
||||
|
||||
#define RAND_SEED 0xDECAFFED
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
#ifdef __XS3A__
|
||||
xscope_config_io(XSCOPE_IO_BASIC);
|
||||
#endif
|
||||
|
||||
// Seed the random number generator, using a constant for reproducibility
|
||||
srand(RAND_SEED);
|
||||
|
||||
// This example app only demonstrates the real 32-bit BFP functions, but the other three main categories of
|
||||
// arithmetic BFP functions, real 16-bit, complex 32-bit and complex 16-bit, all work in a very similar way.
|
||||
|
||||
// The slight caveat is that in a complex 32-bit BFP vector the real and imaginary parts of a given element are
|
||||
// stored in adjacent memory locations, in a complex 16-bit BFP vector the real and imaginary parts are stored in
|
||||
// separate buffers.
|
||||
bfp_s32_example();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also prints headroom.
|
||||
static void print_vector(
|
||||
const bfp_s32_t* vect,
|
||||
const char* name,
|
||||
const unsigned line_num)
|
||||
{
|
||||
printf("=== From line #%u ===\n", line_num);
|
||||
|
||||
// First, the raw mantissas
|
||||
printf("%s = [", name);
|
||||
for(unsigned int k = 0; k < vect->length; k++)
|
||||
printf("%ld, ", (long int)vect->data[k]);
|
||||
printf("] * 2**(%d)\n", vect->exp);
|
||||
|
||||
// Next, the float equivalent
|
||||
printf("%s_float = [", name);
|
||||
for(unsigned int k = 0; k < vect->length; k++)
|
||||
printf("%0.07f, ", ldexp(vect->data[k], vect->exp));
|
||||
printf("]\n");
|
||||
|
||||
// And the headroom
|
||||
printf("%s headroom: %u\n\n", name, vect->hr);
|
||||
}
|
||||
|
||||
#define PRINT_VECT(X) print_vector(&X, #X, __LINE__)
|
||||
#define PRINT_SCALAR(X) printf("%s = %0.07f\n\n", #X, ldexp((double) X.mant, X.exp))
|
||||
|
||||
|
||||
// The length (in elements) of our BFP vectors. We're using a small length to keep the text output relatively tidy.
|
||||
#define LENGTH (4)
|
||||
|
||||
|
||||
|
||||
void bfp_s32_example()
|
||||
{
|
||||
|
||||
printf("###################################\n");
|
||||
printf("### 32-bit block floating-point ###\n");
|
||||
printf("###################################\n\n");
|
||||
|
||||
// Let's use 3 BFP vectors, A, B and C for this example. For real 32-bit BFP vectors, we use the type bfp_s32_t.
|
||||
bfp_s32_t A, B, C;
|
||||
|
||||
// The bfp_s32_t type does not allocate its own buffer for the mantissa vector (nor does the initialization function).
|
||||
// Instead, it contains a pointer that points to the backing buffer.
|
||||
int32_t buffer_A[LENGTH];
|
||||
int32_t buffer_B[LENGTH];
|
||||
int32_t buffer_C[LENGTH];
|
||||
|
||||
// With the space allocated, we can initialize our BFP vectors.
|
||||
// bfp_s32_init() takes:
|
||||
// - A pointer to the bfp_s32_t to be initialized
|
||||
// - The address of the buffer which backs the vector
|
||||
// - The initial exponent of the vector
|
||||
// - The length of the vector, in elements
|
||||
// - a boolean indicating whether or not the headroom should be immediately calculated.
|
||||
printf("Initializing BFP vectors A, B and C\n");
|
||||
bfp_s32_init(&A, buffer_A, -31, LENGTH, 0);
|
||||
bfp_s32_init(&B, buffer_B, -31, LENGTH, 0);
|
||||
bfp_s32_init(&C, buffer_C, -31, LENGTH, 0);
|
||||
|
||||
// Above we specified an initial exponent of -31. For a signed 32-bit value, which can hold integers between
|
||||
// (and including) -(2^31) and (2^31)-1, an exponent of -31 means the representable range is -1.0 to 1.0,
|
||||
// including the former, but not the latter.
|
||||
|
||||
// Above we also instructed bfp_s32_init() NOT to compute the headroom. This is because for B and C we're about
|
||||
// to randomly set the mantissas, invalidating whatever headroom was calculated. A, on the other hand, will first
|
||||
// be used as an output vector, so its exponent, headroom and mantissa data will be clobbered.
|
||||
|
||||
// This is because we're going to fill them with random data next, so their current values are irrelevant.
|
||||
|
||||
// Fill vector's B and C with random data.
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
B.data[k] = rand();
|
||||
C.data[k] = rand();
|
||||
}
|
||||
|
||||
// Last thing before we print out the initial values of vectors B and C will be to force a recalculation of their
|
||||
// headroom with this random data. A call to bfp_s32_headroom() accomplishes this.
|
||||
printf("\nUpdating headroom of vectors B and C.\n");
|
||||
bfp_s32_headroom(&B);
|
||||
bfp_s32_headroom(&C);
|
||||
|
||||
// Now print out the initial values for B and C.
|
||||
{
|
||||
printf("Initial values for BFP vectors B and C:\n");
|
||||
PRINT_VECT(B);
|
||||
PRINT_VECT(C);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Let's start by applying a right-shift to the values of B using bfp_s32_shl() (with a negative shift). This isn't
|
||||
// necessary for the calls that follow, we can at least watch what happens to the mantissas, exponent and headroom.
|
||||
{
|
||||
printf("A[k] <-- B[k] >> 10\n");
|
||||
bfp_s32_shl(&A, &B, -10);
|
||||
PRINT_VECT(A);
|
||||
// Applying a -10 bit left-shift effectively divides the vector by a factor of 1024. It's worth pointing out that
|
||||
// because we're doing this in block floating-point, it would have been (nearly) equivalent to simply reduce the
|
||||
// exponent of B by 10.
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Now let's try adding together vectors B and C element-wise.
|
||||
{
|
||||
printf("A[k] <-- B[k] + C[k]\n");
|
||||
bfp_s32_add(&A, &B, &C);
|
||||
PRINT_VECT(A);
|
||||
|
||||
// Note that the above call to bfp_s32_add() takes 3 arguments. The first is the BFP vector into which the result
|
||||
// will be placed, the second and third are the two input vectors. All three bfp_s32_t's must have already been
|
||||
// initialized (otherwise they won't point to a valid data buffer), although the mantissas, exponent and headroom
|
||||
// of the output vector will be clobbered.
|
||||
// Most BFP functions in this library are very similar, with the first (non-const) bfp_s32_t* being the output vector
|
||||
// and the subsequent bfp_s32_t pointers being the input vectors.
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// In the previous step, we updated A with the sum of vector's B and C. But, if we have no need to keep the contents
|
||||
// of B around (e.g. it's needed for a subsequent stage), we may be able to forego the memory cost of a third
|
||||
// vector (A) just for the output by updating B in-place.
|
||||
{
|
||||
printf("B[k] <-- B[k] + C[k]\n");
|
||||
bfp_s32_add(&B, &B, &C);
|
||||
PRINT_VECT(B);
|
||||
|
||||
// Except where explicitly stated otherwise, the element-wise BFP functions in this library (which take a
|
||||
// pointer to an output vector) can safely update one or the other of the input vectors in-place.
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Element-wise multiplication works just like addition:
|
||||
{
|
||||
printf("A[k] <-- B[k] * C[k]\n");
|
||||
bfp_s32_mul(&A, &B, &C);
|
||||
PRINT_VECT(A);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// There are several BFP functions that take only a single input vector:
|
||||
|
||||
// Rectify a vector so that all elements are non-negative
|
||||
{
|
||||
printf("B[k] <-- max{ A[k], 0 }\n");
|
||||
bfp_s32_rect(&B, &A);
|
||||
PRINT_VECT(B);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Take the square root of each element of a vector
|
||||
{
|
||||
printf("A[k] <-- sqrt(B[k])\n");
|
||||
bfp_s32_sqrt(&A, &B);
|
||||
PRINT_VECT(A);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// There are others that take a vector and a scalar as inputs, like bfp_s32_scale()
|
||||
{
|
||||
float_s32_t alpha = {
|
||||
.mant = 0xBADDA7E5, //mantissa
|
||||
.exp = -23 //exponent
|
||||
};
|
||||
printf("C[k] <-- C[k] * %0.07f\n", ldexp(alpha.mant, alpha.exp));
|
||||
bfp_s32_scale(&C, &C, alpha);
|
||||
PRINT_VECT(C);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// And still others that produce a scalar, rather than a vector
|
||||
|
||||
// Inner product of vectors B and C
|
||||
{
|
||||
printf("alpha <-- sum{ B[k] * C[k] } (inner product)\n");
|
||||
float_s64_t alpha = bfp_s32_dot(&B, &C);
|
||||
PRINT_SCALAR(alpha);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Maximum element of a BFP vector
|
||||
{
|
||||
printf("alpha <-- max{ B[0], B[1], ... B[LENGTH-1] }\n");
|
||||
float_s32_t alpha = bfp_s32_max(&B);
|
||||
PRINT_SCALAR(alpha);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
// Mean value of a BFP vector
|
||||
{
|
||||
printf("alpha <-- mean{ A[0], A[1], ..., A[LENGTH-1] }\n");
|
||||
float_s32_t alpha = bfp_s32_mean(&A);
|
||||
PRINT_SCALAR(alpha);
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
41
lib_xcore_math/examples/app_fft_demo/CMakeLists.txt
Normal file
41
lib_xcore_math/examples/app_fft_demo/CMakeLists.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
|
||||
project(app_fft_demo)
|
||||
|
||||
if(DEFINED XCORE_TARGET)
|
||||
set(APP_HW_TARGET ${XCORE_TARGET})
|
||||
else()
|
||||
set(APP_HW_TARGET XK-EVK-XU316)
|
||||
endif()
|
||||
|
||||
set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
|
||||
|
||||
set(APP_COMPILER_FLAGS -Os)
|
||||
|
||||
if(NOT BUILD_NATIVE)
|
||||
list(APPEND APP_COMPILER_FLAGS -fxscope
|
||||
-report
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
list(APPEND APP_COMPILER_FLAGS -Werror
|
||||
-g
|
||||
)
|
||||
else()
|
||||
list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
|
||||
# Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
|
||||
# See online help for details.
|
||||
-D_CRT_SECURE_NO_WARNINGS
|
||||
# Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
|
||||
/wd5045
|
||||
)
|
||||
endif()
|
||||
|
||||
if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
|
||||
list(APPEND APP_COMPILER_FLAGS -lm)
|
||||
endif()
|
||||
|
||||
XMOS_REGISTER_APP()
|
||||
453
lib_xcore_math/examples/app_fft_demo/src/main.c
Normal file
453
lib_xcore_math/examples/app_fft_demo/src/main.c
Normal file
@@ -0,0 +1,453 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __XS3A__
|
||||
# include <xscope.h>
|
||||
#endif
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
void fft_mono_example();
|
||||
void fft_stereo_example();
|
||||
void fft_complex_example();
|
||||
|
||||
#define RAND_SEED 3457345734
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
#ifdef __XS3A__
|
||||
xscope_config_io(XSCOPE_IO_BASIC);
|
||||
#endif
|
||||
|
||||
// Seed the random number generator, using a constant for reproducibility
|
||||
srand(RAND_SEED);
|
||||
|
||||
fft_mono_example();
|
||||
fft_stereo_example();
|
||||
fft_complex_example();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// The length (in elements) of our BFP vectors.
|
||||
#define LENGTH (128)
|
||||
|
||||
|
||||
void fft_mono_example()
|
||||
{
|
||||
|
||||
printf("\n\n\n");
|
||||
printf("################\n");
|
||||
printf("### Mono FFT ###\n");
|
||||
printf("################\n\n");
|
||||
|
||||
/*
|
||||
This function demonstrates how apply the forward and inverse FFT to a single channel of
|
||||
real data. This is accomplished using the bfp_fft_forward_mono() and bfp_fft_inverse_mono()
|
||||
functions respectively.
|
||||
|
||||
bfp_s32_t --bfp_fft_forward_mono()--> bfp_complex_s32_t
|
||||
bfp_complex_s32_t --bfp_fft_inverse_mono()--> bfp_s32_t
|
||||
*/
|
||||
|
||||
// bfp_fft_forward_mono() requires an input BFP vector of type bfp_s32_t, whose mantissa vector
|
||||
// is backed by an int32_t array.
|
||||
// The +2 below is so spectrum unpacking can be demonstrated (more on this below).
|
||||
int32_t buffer[LENGTH+2];
|
||||
|
||||
// We'll just fill that vector with random mantissas.
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
buffer[k] = rand() << 1;
|
||||
}
|
||||
|
||||
// Before doing the forward FFT the input vector needs to be initialized. In many situations there
|
||||
// is no obvious natural exponent to associate with the time-domain signal (such as PCM audio
|
||||
// streams). In such a situation, it is often convenient to select an exponent that normalizes the
|
||||
// data to a known range, such as [-1.0, 1.0). For 32-bit signed data, an exponent of -31 does
|
||||
// that.
|
||||
// Note that the final parameter is instructing the init function to compute the headroom of the
|
||||
// input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
|
||||
// there would be no point to computing the headroom here, as it is invalidated the moment we
|
||||
// modify x.data[].
|
||||
bfp_s32_t x;
|
||||
bfp_s32_init(&x, buffer, -31, LENGTH, 1);
|
||||
|
||||
// Print out the floating point equivalent values of the input vector's elements prior to applying
|
||||
// the FFT.
|
||||
printf("x = [");
|
||||
for(unsigned int k = 0; k < x.length; k++)
|
||||
printf("%0.04f, ", ldexp(x.data[k], x.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
// bfp_fft_forward_mono() operates on data in-place. We'll print out the buffer address before
|
||||
// and after the transformation to convince ourselves of this.
|
||||
#ifdef __xcore__
|
||||
printf("&x.data[0] --> 0x%08X\n", (unsigned) &x.data[0]);
|
||||
#endif
|
||||
printf("x.length --> %u\n\n", x.length);
|
||||
|
||||
|
||||
// Apply the FFT.
|
||||
// This function takes a pointer to the input (time-domain, usually) BFP vector, and returns
|
||||
// a pointer *to the same address* recast to a complex BFP vector.
|
||||
bfp_complex_s32_t* X = bfp_fft_forward_mono(&x);
|
||||
|
||||
/*
|
||||
There are two important things to be aware of with the mono FFT.
|
||||
|
||||
1) The complex output vector X is only half the length of the input vector.
|
||||
2) The real part of the Nyquist rate component of the FFT is packed into the first element of X.
|
||||
|
||||
The DFT of an N-element signal is periodic with period N. Additionally, the DFT of a real
|
||||
signal has a complex conjugate symmetry about angular frequency 0. Whereas a fully complex
|
||||
DFT requires N complex elements to fully specify the signal, the symmetry property of a
|
||||
real DFT means that only N/2 complex elements are needed.
|
||||
|
||||
A consequence of the periodicity and conjugate symmetry properties taken together is that the
|
||||
components of the DFT at DC and at the Nyquist rate must have 0 imaginary part. Rather than
|
||||
require the user to supply an output vector with space allocated for N/2+1 complex elements,
|
||||
we opt to compute the DFT in-place, and pack the Nyquist rate component into the imaginary
|
||||
part of the DC component.
|
||||
|
||||
If the user wishes, she can allocate space for 2 extra int32_t elements (same as 1 complex_s32_t
|
||||
element) in the time-domain buffer (but *not* identified in the length of the time-domain BFP
|
||||
vector, as that must be a power of 2), and unpack the Nyquist component after the FFT is applied
|
||||
(making sure to modify the frequency-domain BFP vectors length by adding 1). Note that
|
||||
bfp_fft_inverse_mono() however expects the data to be packed as bfp_fft_forward_mono() left it.
|
||||
|
||||
Note also that the inverse mono FFT also assumes the conjugate symmetry of the frequency
|
||||
spectrum, meaning that the inverse transform is guaranteed to yield a purely real signal.
|
||||
*/
|
||||
|
||||
#define UNPACK_SPECTRUM_MONO 1
|
||||
|
||||
// Unpack the mono spectrum
|
||||
if(UNPACK_SPECTRUM_MONO) {
|
||||
bfp_fft_unpack_mono(X);
|
||||
}
|
||||
|
||||
// Print out the buffer address and vector length of X, the frequency-domain signal
|
||||
#ifdef __xcore__
|
||||
printf("&X->data[0] --> 0x%08X\n", (unsigned) &X->data[0]);
|
||||
#endif
|
||||
printf("X->length --> %u\n\n", X->length);
|
||||
|
||||
// Print out the complex BFP vector X
|
||||
printf("X = [");
|
||||
for(unsigned int k = 0; k < X->length; k++)
|
||||
printf("(%0.04f + %0.04fj), ", ldexp(X->data[k].re, X->exp), ldexp(X->data[k].im, X->exp) );
|
||||
printf("]\n\n");
|
||||
|
||||
///////////////////////////////
|
||||
// ...
|
||||
// Here you are free to do any required frequency-domain processing of the signal.
|
||||
// ...
|
||||
///////////////////////////////
|
||||
|
||||
// Re-pack the mono spectrum
|
||||
if(UNPACK_SPECTRUM_MONO) {
|
||||
bfp_fft_pack_mono(X);
|
||||
}
|
||||
|
||||
// Apply the inverse FFT.
|
||||
// Like the forward mono FFT, this function returns a pointer, this time a bfp_s32_t*, which is
|
||||
// the function's input parameter recast. In this case we can ignore the returned value, as it
|
||||
// would simply point to x, which is already of the correct type and readily available. In other
|
||||
// situations (e.g. when x exists in a different scope) the returned value may be useful.
|
||||
bfp_fft_inverse_mono(X);
|
||||
|
||||
// Finally, print out the inverse transformed signal, which should match the original signal to
|
||||
// within the arithmetic precision of the forward and inverse transform pair.
|
||||
printf("x = [");
|
||||
for(unsigned int k = 0; k < x.length; k++)
|
||||
printf("%0.04f, ", ldexp(x.data[k], x.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
void fft_stereo_example()
|
||||
{
|
||||
|
||||
printf("\n\n\n");
|
||||
printf("##################\n");
|
||||
printf("### Stereo FFT ###\n");
|
||||
printf("##################\n\n");
|
||||
|
||||
/*
|
||||
This function demonstrates how apply the forward and inverse FFT to a pair of channels, each
|
||||
containing a sequence of real sample data. This is accomplished using the
|
||||
bfp_fft_forward_stereo() and bfp_fft_inverse_stereo() functions respectively.
|
||||
|
||||
2 x bfp_s32_t --bfp_fft_forward_stereo()--> 2 x bfp_complex_s32_t
|
||||
2 x bfp_complex_s32_t --bfp_fft_inverse_stereo()--> 2 x bfp_s32_t
|
||||
*/
|
||||
|
||||
// bfp_fft_forward_stereo() requires 2 input BFP vectors of type bfp_s32_t, whose mantissa
|
||||
// vectors are backed by a int32_t arrays. The +1 is because we're going to do some unpacking of
|
||||
// the spectra after applying the FFT. See fft_mono_example() for unpacking explanation.
|
||||
// The DWORD_ALIGNED qualifier instructs the compiler to ensure the arrays start at an
|
||||
// 8-byte-aligned memory offset.
|
||||
int32_t DWORD_ALIGNED bufferA[LENGTH + 1];
|
||||
int32_t DWORD_ALIGNED bufferB[LENGTH + 1];
|
||||
|
||||
// bfp_fft_forward_stereo() and bfp_fft_inverse_stereo() also require a scratch buffer with
|
||||
// element type complex_s32_t which is the same length (in elements) of the input vectors.
|
||||
complex_s32_t DWORD_ALIGNED scratch[LENGTH];
|
||||
|
||||
// Fill in the buffers with random mantissas (left shift is to ensure some are negative).
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
bufferA[k] = rand() << 1;
|
||||
bufferB[k] = rand() << 1;
|
||||
}
|
||||
|
||||
// Before doing the forward FFT the array needs to be turned into a proper BFP vector. So we
|
||||
// initialize a pair of bfp_s32_t structs.
|
||||
//
|
||||
// In many situations there is no obvious natural exponent to associate with the time-domain
|
||||
// signal (such as PCM audio streams). In such a situation, it is often convenient to select an
|
||||
// exponent that normalizes the data to a known range, such as [-1.0, 1.0). For 32-bit signed
|
||||
// data, an exponent of -31 does that.
|
||||
//
|
||||
// Note that the final parameter is instructing the init function to compute the headroom of the
|
||||
// input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
|
||||
// there would be no point to computing the headroom here, as it is invalidated the moment we
|
||||
// modify a.data[] or b.data[].
|
||||
bfp_s32_t a, b;
|
||||
bfp_s32_init(&a, bufferA, -31, LENGTH, 1);
|
||||
bfp_s32_init(&b, bufferB, -31, LENGTH, 1);
|
||||
|
||||
// Print out the floating point equivalent values of the input vectors' elements prior to applying
|
||||
// the FFT.
|
||||
printf("a = [");
|
||||
for(unsigned int k = 0; k < a.length; k++)
|
||||
printf("%0.04f, ", ldexp(a.data[k], a.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
printf("b = [");
|
||||
for(unsigned int k = 0; k < b.length; k++)
|
||||
printf("%0.04f, ", ldexp(b.data[k], b.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
// Apply the FFT.
|
||||
//
|
||||
// This function takes a pointer to the input BFP vectors and a scratch buffer
|
||||
bfp_fft_forward_stereo(&a, &b, scratch);
|
||||
|
||||
// The two bfp_s32_t vectors containing the time-domain data have now been clobbered with the
|
||||
// frequency-domain data, and should not be directly accessed. Instead, we alias each bfp_s32_t to
|
||||
// a bfp_complex_s32_t and access the results through that.
|
||||
bfp_complex_s32_t* ChA = (bfp_complex_s32_t*) &a;
|
||||
bfp_complex_s32_t* ChB = (bfp_complex_s32_t*) &b;
|
||||
|
||||
|
||||
/*
|
||||
See the note above (in fft_mono_example()) about the properties of the real DFT that allow it to
|
||||
be computed in-place. Much of the same applies for the stereo DFT.
|
||||
|
||||
It is another property of the DFT that the real and imaginary parts of the input domain are
|
||||
transformed in a way which allows two purely real signals to be DFTed simultaneously and their
|
||||
spectra to be fully separated afterwards. This is done by bfp_fft_forward_stereo().
|
||||
|
||||
Like the mono FFT, each output spectrum (channels A and B) is represented by only N/2 complex
|
||||
elements. This again is allowed as a result of the assumption that the input signals are purely
|
||||
real. Also like the mono FFT, the real part of the Nyquist rate is packed into the imaginary
|
||||
part of the DC element for each of the spectra.
|
||||
|
||||
The spectrum for each channel can be unpacked using bfp_fft_unpack_mono(), just as with
|
||||
bfp_fft_forward_mono().
|
||||
|
||||
If much computation is to be done in the frequency domain this unpacking is likely to be
|
||||
relatively inexpensive and to reduce the likelihood of errors (e.g. complex multiplication of
|
||||
the packed spectrum will likely produce undesired effects).
|
||||
|
||||
Before performing the inverse FFT, the signal must be re-packed with bfp_fft_pack_mono().
|
||||
|
||||
*/
|
||||
|
||||
#define UNPACK_SPECTRA_STEREO 1
|
||||
|
||||
// Unpack the spectra
|
||||
if(UNPACK_SPECTRA_STEREO) {
|
||||
bfp_fft_unpack_mono(ChA);
|
||||
bfp_fft_unpack_mono(ChB);
|
||||
}
|
||||
|
||||
|
||||
// Print out the lengths of the channel A and channel B frequency spectra.
|
||||
printf("ChA.length --> %u\n\n", ChA->length);
|
||||
printf("ChB.length --> %u\n\n", ChB->length);
|
||||
|
||||
// Print out the floating-point equivalent of the channel A and B frequency spectra.
|
||||
printf("ChA = [");
|
||||
for(unsigned int k = 0; k < ChA->length; k++)
|
||||
printf("(%0.04f + %0.04fj), ", ldexp(ChA->data[k].re, ChA->exp), ldexp(ChA->data[k].im, ChA->exp) );
|
||||
printf("]\n\n");
|
||||
|
||||
printf("ChB = [");
|
||||
for(unsigned int k = 0; k < ChB->length; k++)
|
||||
printf("(%0.04f + %0.04fj), ", ldexp(ChB->data[k].re, ChB->exp), ldexp(ChB->data[k].im, ChB->exp) );
|
||||
printf("]\n\n");
|
||||
|
||||
///////////////////////////////
|
||||
// ...
|
||||
// Here you are free to do any required frequency-domain processing of the signal.
|
||||
// ...
|
||||
///////////////////////////////
|
||||
|
||||
|
||||
// Repack the spectra
|
||||
if(UNPACK_SPECTRA_STEREO) {
|
||||
bfp_fft_pack_mono(ChA);
|
||||
bfp_fft_pack_mono(ChB);
|
||||
}
|
||||
|
||||
// Apply the inverse FFT.
|
||||
// This function behaves much like the forward stereo FFT, with the input and output parameters
|
||||
// reversed.
|
||||
bfp_fft_inverse_stereo(ChA, ChB, scratch);
|
||||
|
||||
// ChA and ChB should now be considered clobbered, and `a` and `b` can be used to access the
|
||||
// time-domain data.
|
||||
|
||||
// Finally, print out the inverse transformed signal, which should match the original signal to
|
||||
// within the arithmetic precision of the forward and inverse transform pair.
|
||||
printf("a = [");
|
||||
for(unsigned int k = 0; k < a.length; k++)
|
||||
printf("%0.04f, ", ldexp(a.data[k], a.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
printf("b = [");
|
||||
for(unsigned int k = 0; k < b.length; k++)
|
||||
printf("%0.04f, ", ldexp(b.data[k], b.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
void fft_complex_example()
|
||||
{
|
||||
|
||||
printf("\n\n\n");
|
||||
printf("###################\n");
|
||||
printf("### Complex FFT ###\n");
|
||||
printf("###################\n\n");
|
||||
|
||||
/*
|
||||
This function demonstrates how apply the forward and inverse FFT to single channel containing
|
||||
containing a sequence of complex sample data. This is accomplished using the
|
||||
bfp_fft_forward_complex() and bfp_fft_inverse_complex() functions respectively.
|
||||
|
||||
bfp_complex_s32_t --bfp_fft_forward_complex()--> bfp_complex_s32_t
|
||||
bfp_complex_s32_t --bfp_fft_inverse_complex()--> bfp_complex_s32_t
|
||||
*/
|
||||
|
||||
// bfp_fft_forward_complex() requires an input BFP vector of type bfp_complex_s32_t, whose
|
||||
// mantissa vector is backed by a complex_s32_t array.
|
||||
// The complex_s32_t struct contains 2 fields, `re` for the real part and `im` for the
|
||||
// imaginary part of an element. The buffer is then an alternating sequence of real and
|
||||
// imaginary parts for successive elements.
|
||||
complex_s32_t buffer[LENGTH];
|
||||
|
||||
// Fill in the buffer with random mantissas (left shift is to ensure some are negative).
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
buffer[k].re = rand() << 1;
|
||||
buffer[k].im = rand() << 1;
|
||||
}
|
||||
|
||||
// Before doing the forward FFT the array needs to be turned into a proper BFP vector. So we
|
||||
// initialize a bfp_complex_s32_t with bfp_complex_s32_init().
|
||||
// In many situations there is no obvious natural exponent to associate with the time-domain
|
||||
// signal. In such a situation, it is often convenient to select an exponent that normalizes
|
||||
// the data to a known range, such as [-1.0, 1.0). For 32-bit signed data, an exponent of -31
|
||||
// does that.
|
||||
// Alternatively, initializing with an exponent of 0 has the advantage that, after having
|
||||
// applied the inverse transform, the resulting exponent indicates the overall level of scaling
|
||||
// applied throughout the process.
|
||||
// Note that the final parameter is instructing the init function to compute the headroom of the
|
||||
// input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
|
||||
// there would be no point to computing the headroom here, as it is invalidated the moment we
|
||||
// modify x.data[].
|
||||
bfp_complex_s32_t x;
|
||||
bfp_complex_s32_init(&x, buffer, -31, LENGTH, 1);
|
||||
|
||||
// Print out the floating point equivalent values of the input vector's elements prior to applying
|
||||
// the FFT.
|
||||
printf("x = [");
|
||||
for(unsigned int k = 0; k < x.length; k++)
|
||||
printf("%0.04f + %0.04fj, ", ldexp(x.data[k].re, x.exp), ldexp(x.data[k].im, x.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
// bfp_fft_forward_complex() operates on data in-place. We'll print out the buffer address before
|
||||
// and after the transformation to convince ourselves of this.
|
||||
#ifdef __xcore__
|
||||
printf("&x.data[0] --> 0x%08X\n", (unsigned) &x.data[0]);
|
||||
#endif
|
||||
printf("x.length --> %u\n\n", x.length);
|
||||
|
||||
// Apply the FFT.
|
||||
// Unlike the mono and stereo FFTs, the input and output vector types for the complex FFT are the
|
||||
// the same, so no additional structs or type casts are required.
|
||||
bfp_fft_forward_complex(&x);
|
||||
|
||||
/*
|
||||
Unlike bfp_fft_forward_mono() and bfp_fft_forward_stereo(), the input signal for
|
||||
bfp_fft_forward_complex() is not purely real. As such, a full FFT_N complex elements are
|
||||
required to represent the spectrum.
|
||||
|
||||
Additionally there is no packing of the Nyquist component's real part into the DC component's
|
||||
imaginary part.
|
||||
|
||||
bfp_fft_forward_complex() can be used to perform a DFT on a single channel of purely real data
|
||||
by simply placing that signal in the real parts of a complex vector, with each element's
|
||||
imaginary part being zero. It's worth noting a few differences between doing this and just using
|
||||
bfp_fft_forward_mono().
|
||||
|
||||
1) The complex input vector requires twice as much memory. For long FFTs, this may be
|
||||
appreciable.
|
||||
2) The cycles taken for bfp_fft_forward_mono() to compute an FFT_N-point DFT is approximately the
|
||||
same as the cycles taken to perform an FFT_N/2-point DFT using bfp_fft_forward_complex().
|
||||
3) The complex FFT does not result in a packed signal.
|
||||
*/
|
||||
|
||||
// ... howevr, we'll just create a new pointer to the spectrum to clarify when we're operating
|
||||
// in the time domain when we're in the frequency domain (`X` being frequency domain)
|
||||
bfp_complex_s32_t* X = &x;
|
||||
|
||||
// Print out the address and length of the complex frequency spectrum
|
||||
#ifdef __xcore__
|
||||
printf("&X->data[0] --> 0x%08X\n", (unsigned) &X->data[0]);
|
||||
#endif
|
||||
printf("X->length --> %u\n\n", X->length);
|
||||
|
||||
// Print out the floating-point equivalent of the complex frequency spectrum
|
||||
printf("X = [");
|
||||
for(unsigned int k = 0; k < X->length; k++)
|
||||
printf("(%0.04f + %0.04fj), ", ldexp(X->data[k].re, X->exp), ldexp(X->data[k].im, X->exp) );
|
||||
printf("]\n\n");
|
||||
|
||||
///////////////////////////////
|
||||
// ...
|
||||
// Here you are free to do any required frequency-domain processing of the signal.
|
||||
// ...
|
||||
///////////////////////////////
|
||||
|
||||
// Apply the inverse FFT
|
||||
bfp_fft_inverse_complex(X);
|
||||
|
||||
// Finally, print out the inverse transformed signal, which should match the original signal to
|
||||
// within the arithmetic precision of the forward and inverse transform pair.
|
||||
printf("x = [");
|
||||
for(unsigned int k = 0; k < x.length; k++)
|
||||
printf("%0.04f + %0.04fj, ", ldexp(x.data[k].re, x.exp), ldexp(x.data[k].im, x.exp));
|
||||
printf("]\n\n");
|
||||
|
||||
|
||||
}
|
||||
41
lib_xcore_math/examples/app_filter_demo/CMakeLists.txt
Normal file
41
lib_xcore_math/examples/app_filter_demo/CMakeLists.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
|
||||
project(app_filter_demo)
|
||||
|
||||
if(DEFINED XCORE_TARGET)
|
||||
set(APP_HW_TARGET ${XCORE_TARGET})
|
||||
else()
|
||||
set(APP_HW_TARGET XK-EVK-XU316)
|
||||
endif()
|
||||
|
||||
set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
|
||||
|
||||
set(APP_COMPILER_FLAGS -Os)
|
||||
|
||||
if(NOT BUILD_NATIVE)
|
||||
list(APPEND APP_COMPILER_FLAGS -fxscope
|
||||
-report
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
list(APPEND APP_COMPILER_FLAGS -Werror
|
||||
-g
|
||||
)
|
||||
else()
|
||||
list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
|
||||
# Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
|
||||
# See online help for details.
|
||||
-D_CRT_SECURE_NO_WARNINGS
|
||||
# Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
|
||||
/wd5045
|
||||
)
|
||||
endif()
|
||||
|
||||
if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
|
||||
list(APPEND APP_COMPILER_FLAGS -lm)
|
||||
endif()
|
||||
|
||||
XMOS_REGISTER_APP()
|
||||
174
lib_xcore_math/examples/app_filter_demo/src/filter_16bit_fir.c
Normal file
174
lib_xcore_math/examples/app_filter_demo/src/filter_16bit_fir.c
Normal file
@@ -0,0 +1,174 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
|
||||
#define TAP_COUNT (35)
|
||||
|
||||
#define FILTER_TICKS (50)
|
||||
|
||||
|
||||
void filter_16bit_fir()
|
||||
{
|
||||
printf("\n\n\n\n");
|
||||
printf("==========================\n");
|
||||
printf("== 16-bit FIR Filtering ==\n");
|
||||
printf("==========================\n");
|
||||
|
||||
/**
|
||||
* In this example we'll look at how to use the 16-bit FIR filtering functions.
|
||||
*
|
||||
* The applied filter logic is approximately:
|
||||
*
|
||||
* output[n] = sum_k{ input[n-k] * b[k] } >> shift
|
||||
* where
|
||||
* > output[n] is the filter output at time n
|
||||
* > input[n-k] is the filter input k time steps before time n
|
||||
* > b[k] is the user-supplied filter coefficient associated with time lag k
|
||||
* (i.e. larger k is applied to older samples)
|
||||
* > shift is a user-specified power-of-two scale factor
|
||||
* > sum_k{} is a sum over variable k with 0 < k < N_taps, and
|
||||
* > N_taps is the number of filter taps.
|
||||
*
|
||||
* The multiplication inside the sum is a full 32-bit multiplication. The maximum value of
|
||||
* any such product is then
|
||||
* ((-2^15)*(-2^15)) = 2^(30). These (effectively 31-bit) products are accumulated into a
|
||||
* 32-bit saturating accumulator. Finally, a rounding, saturating right-shift of `shift` bits
|
||||
* is applied to the 32-bit sum to get the final result which is output. The final shift
|
||||
* saturates to the symmetric 16-bit bounds of INT16_MAX and -INT16_MAX. (Note that the
|
||||
* 32-bit accumulator saturates to symmetric 32-bit bounds.)
|
||||
*
|
||||
* Below we'll declare and initialize a 16-bit FIR filter with TAP_COUNT random coefficients.
|
||||
* We'll then iterate for TAP_COUNT steps, adding samples to the filter without actually computing
|
||||
* the filter output. Finally, we'll iterate another FILTER_TICKS steps, adding random samples to
|
||||
* the filter and calculating the outputs.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Declare the required filter struct, filter_fir_s16_t.
|
||||
*
|
||||
* If your application requires multiple 16-bit FIR filters, one of these should be allocated
|
||||
* for each such filter.
|
||||
*/
|
||||
filter_fir_s16_t filter;
|
||||
|
||||
/**
|
||||
* Allocate buffers for the filter coefficients and the filter state.
|
||||
*
|
||||
* To initialize the filter, the init function needs to be provided with the filter coefficients.
|
||||
* In practice, if filter coefficients are known in advance, these can be declared as read-only
|
||||
* memory. They will not be modified by the filter API.
|
||||
*
|
||||
* The filter API uses a circular buffer to mangage the filter's state for the user. To do this,
|
||||
* it requires a user-supplied buffer in which to place the historical sample data.
|
||||
*
|
||||
* Both coefficient and sample buffers must be larger enough to contain TAP_COUNT int16_t
|
||||
* elements.
|
||||
*
|
||||
* Additionally, both buffers must begin at a 4-byte- (word-) aligned address. This alignment
|
||||
* should be manually specified if the buffer was originally declared as a char array, for
|
||||
* example.
|
||||
*/
|
||||
int16_t WORD_ALIGNED coefficients[TAP_COUNT] = {0};
|
||||
int16_t WORD_ALIGNED sample_buffer[TAP_COUNT] = {0};
|
||||
|
||||
/**
|
||||
* How to choose the final shift factor for a filter is beyond the scope of this example, but
|
||||
* here a final scale of 15 bits seems to work correctly. Note that this shift is unsigned
|
||||
* (i.e. negative values are treated as large right-shifts; they will not left-shift).
|
||||
*/
|
||||
right_shift_t filter_shr = 15;
|
||||
|
||||
/**
|
||||
* Choose some random coefficients
|
||||
*
|
||||
* We have to be more careful here selecting coefficients than we were with the 32-bit FIR filter.
|
||||
* The difference is that the 32-bit FIR filter has a built-in 30-bit right-shift when the product
|
||||
* of the coefficients and samples are computed, which substantially limits how quickly the
|
||||
* accumulators are able to fill up. With the 16-bit FIR filters, the internal accumulator can
|
||||
* only store 32-bits.
|
||||
*
|
||||
* The largest product of two 16-bit numbers is (-2^15 * -2^15) = 2^30. The largest supported
|
||||
* accumulator value is approximately 2^31. That means that in the worst case it takes only two
|
||||
* taps to hit saturation with a 16-bit FIR filter.
|
||||
*
|
||||
* Instead, we want the absolute sum of coefficients to be at most 2^16 ( 2^15 * 2^16 = 2^31 ).
|
||||
* So, we'll first pick random coefficients, then we'll normalize them to ensure they don't
|
||||
* exceed that.
|
||||
*/
|
||||
if(1){
|
||||
int64_t coef_sum = 0;
|
||||
for(int lag = 0; lag < TAP_COUNT; lag++){
|
||||
coefficients[lag] = (int16_t) rand();
|
||||
coef_sum += abs(coefficients[lag]);
|
||||
}
|
||||
|
||||
double norm = ldexp(1.0, 16) / ldexp((double) coef_sum, 0);
|
||||
|
||||
for(int lag = 0; lag < TAP_COUNT; lag++){
|
||||
coefficients[lag] = (int16_t) (coefficients[lag] * norm);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize filter with filter_fir_s16_init().
|
||||
*
|
||||
* The parameters here are straight-forward, given the description of filter behavior
|
||||
* above.
|
||||
*/
|
||||
filter_fir_s16_init(&filter, sample_buffer, TAP_COUNT, coefficients, filter_shr);
|
||||
|
||||
// Print out the filter coefficients
|
||||
printf("Filter Coefficients:\n");
|
||||
|
||||
printf("b = [");
|
||||
for(int k = 0; k < TAP_COUNT; k++){
|
||||
printf("% 7d, ", filter.coef[k]);
|
||||
if(k % 8 == 7) printf("\n ");
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
printf("filter_shr = %d\n\n", filter.shift);
|
||||
|
||||
|
||||
// Print the equivalent floating-point FIR filter coefficients
|
||||
printf("b_float = [");
|
||||
for(int k = 0; k < TAP_COUNT; k++){
|
||||
printf("%f, ", ldexp(filter.coef[k], -filter.shift));
|
||||
if(k % 6 == 5) printf("\n ");
|
||||
}
|
||||
printf("]\n\n");
|
||||
|
||||
/**
|
||||
* Using filter_s16_add_sample(), supply the filter with random input samples without actually
|
||||
* computing any filtered output.
|
||||
*
|
||||
* This may be useful, for example, to quickly fill up a filters state with inputs, without
|
||||
* incurring the cost to compute the filtered output for each sample.
|
||||
*/
|
||||
for(int k = -TAP_COUNT; k < 0; k++) {
|
||||
int16_t input_sample = (int16_t) rand();
|
||||
filter_fir_s16_add_sample(&filter, input_sample);
|
||||
|
||||
printf("input[% 4d] = % 7d; output[% 4d] = (N/A)\n", k, input_sample, k);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
/**
|
||||
* Using filter_fir_s16(), supply the filter with new random input samples, computing the filtered
|
||||
* output value each time.
|
||||
*/
|
||||
for(int k = 0; k < FILTER_TICKS; k++){
|
||||
int16_t input_sample = (int16_t) rand();
|
||||
int16_t output_sample = filter_fir_s16(&filter, input_sample);
|
||||
|
||||
printf("input[% 4d] = % 7d; output[% 4d] = % 7d\n", k, input_sample, k, output_sample);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
// Each filter_biquad_s32_t can store (up to) 8 biquad filter sections
|
||||
#define SECTION_COUNT 8
|
||||
#define FILTER_TICKS 120
|
||||
|
||||
/**
|
||||
* Biquad filter used in this example.
|
||||
*
|
||||
* Unlike the 16- and 32-bit FIR filters, 32-bit biquad filters store their state and coefficients
|
||||
* within the filter struct itself.
|
||||
*
|
||||
*/
|
||||
filter_biquad_s32_t filter = {
|
||||
// Number of biquad sections in this filter block
|
||||
.biquad_count = SECTION_COUNT,
|
||||
|
||||
// Filter state, initialized to 0
|
||||
.state = {{0}},
|
||||
|
||||
// Filter coefficients
|
||||
.coef = {
|
||||
{ 0x160cc0a, 0x28aa355, 0x323dae1, 0x1f12a2a, 0x32fc6d, 0x9a142d6, 0xc88bad1, 0x541f6bf },
|
||||
{ 0xb4007f5, 0x4e11ec, 0x362dabe, 0x9bfb667, 0x405747, 0x6d593cd, 0xae295e3, 0xac62779 },
|
||||
{ 0xb5a000d, 0xc84d2fc, 0xa20394f, 0xc9bb57, 0x454e970, 0x912f78e, 0xcf4303, 0x900fdb2 },
|
||||
{-0x574fcb8, -0x6c18c97, -0x2228d4a, -0x6ab3e49, -0xe9532b, -0x2876ab2, -0x2d4d1e7, -0x11aaecb },
|
||||
{-0x4329901, -0x19826ba, -0x7b349e9, -0x7b0ec2e, -0x96e1eb2, -0xbe6675, -0x92fa8be, -0x9cf08a7 }
|
||||
}
|
||||
};
|
||||
|
||||
void filter_32bit_biquad()
|
||||
{
|
||||
printf("\n\n\n\n");
|
||||
printf("=============================\n");
|
||||
printf("== 32-bit Biquad Filtering ==\n");
|
||||
printf("=============================\n");
|
||||
|
||||
/**
|
||||
* In this example, we'll look at how to use the 32-bit biquad filtering functions.
|
||||
*
|
||||
* 32-bit biquad filters use the filter_biquad_s32_t type. Unlike the 16- and 32-bit FIR
|
||||
* filters, the 32-bit biquad filter type carries its own data and need not be initialized.
|
||||
* (see above)
|
||||
*
|
||||
* The coefficients of the 32-bit biquad filter are encoded as Q2.30 fixed-point values. If x[n]
|
||||
* is the input to a biquad section at time n, and y[n] is its output, then the relationship
|
||||
* between y[n] and x[n] for a single biquad filter section is:
|
||||
*
|
||||
* y[n] = ((b0 * x[n ]) >> 30)
|
||||
* + ((b1 * x[n-1]) >> 30)
|
||||
* + ((b2 * x[n-2]) >> 30)
|
||||
* - ((a1 * y[n-1]) >> 30)
|
||||
* - ((a2 * y[n-2]) >> 30)
|
||||
*
|
||||
* where the right-shifts are rounding, and b0, b1, b2, -a1, and -a2 are `filter.coef[0][k]`,
|
||||
* `filter.coef[1][k]`, `filter.coef[2][k]`, `filter.coef[3][k]`, and `filter.coef[4][k]`
|
||||
* respectively, for the kth biquad section. The output y[n] is then also the input to the
|
||||
* (k+1)th biquad section.
|
||||
*
|
||||
* A single filter_biquad_s32_t stores coefficients and state information for up to 8 biquad
|
||||
* sections (the optimal block size for computation). If more than 8 biquad sections are required,
|
||||
* an array of filter_biquad_s32_t structs can be used, and the filter can be executed using
|
||||
* filter_biquads_s32() instead of filter_biquad_s32().
|
||||
*/
|
||||
|
||||
// Print out the coefficients for each biquad filter section
|
||||
for(unsigned int section = 0; section < filter.biquad_count; section++){
|
||||
printf("Biquad Section[%d]\n", section);
|
||||
printf(" -a1 = %f\n", ldexp(filter.coef[3][section], -30));
|
||||
printf(" -a2 = %f\n", ldexp(filter.coef[4][section], -30));
|
||||
printf(" b0 = %f\n", ldexp(filter.coef[0][section], -30));
|
||||
printf(" b1 = %f\n", ldexp(filter.coef[1][section], -30));
|
||||
printf(" b2 = %f\n", ldexp(filter.coef[2][section], -30));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Process random samples.
|
||||
* Because we're only using 1 filter block here, we use filter_biquad_s32().
|
||||
*/
|
||||
for(int k = 0; k < FILTER_TICKS; k++){
|
||||
int32_t input_sample = rand() << 1;
|
||||
int32_t output_sample = filter_biquad_s32(&filter, input_sample);
|
||||
|
||||
printf("input[% 4d] = % 13ld; output[% 4d] = % 13ld\n", k, (long int) input_sample, k, (long int) output_sample);
|
||||
}
|
||||
|
||||
}
|
||||
152
lib_xcore_math/examples/app_filter_demo/src/filter_32bit_fir.c
Normal file
152
lib_xcore_math/examples/app_filter_demo/src/filter_32bit_fir.c
Normal file
@@ -0,0 +1,152 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
|
||||
#define TAP_COUNT (35)
|
||||
|
||||
#define FILTER_TICKS (50)
|
||||
|
||||
|
||||
void filter_32bit_fir()
|
||||
{
|
||||
printf("\n\n\n\n");
|
||||
printf("==========================\n");
|
||||
printf("== 32-bit FIR Filtering ==\n");
|
||||
printf("==========================\n");
|
||||
|
||||
/**
|
||||
* In this example we'll look at how to use the 32-bit FIR filtering functions.
|
||||
*
|
||||
* The applied filter logic is approximately:
|
||||
*
|
||||
* output[n] = sum_k{ (input[n-k] * b[k]) >> 30 } >> shift
|
||||
* where
|
||||
* > output[n] is the filter output at time n
|
||||
* > input[n-k] is the filter input k time steps before time n
|
||||
* > b[k] is the user-supplied filter coefficient associated with time lag k
|
||||
* (i.e. larger k is applied to older samples)
|
||||
* > shift is a user-specified power-of-two scale factor
|
||||
* > sum_k{} is a sum over variable k with 0 < k < N_taps, and
|
||||
* > N_taps is the number of filter taps.
|
||||
*
|
||||
* The multiplication inside the sum is a full 64-bit multiplication with a built-in 30-bit
|
||||
* rounding right-shift (as shown). The maximum value of any such product is then
|
||||
* ((-2^31)*(-2^31)*(2^(-30))) = 2^(62-30) = 2^(32). These (effectively 33-bit) products
|
||||
* are accumulated into a set of 8 internal 40-bit saturating accumulators, accumulator p
|
||||
* containing the partial sum of the filter indices p mod 8. The 8 accumulators containing the
|
||||
* 40-bit partial sums are then added together for an (at-most) 43-bit final sum. Finally, a
|
||||
* rounding, saturating right-shift of `shift` bits is applied to the 43-bit sum to get the
|
||||
* final result which is output. The final shift saturates to the symmetric 32-bit bounds
|
||||
* of INT32_MAX and -INT32_MAX. (Note that each of the 40-bit accumulators saturates to
|
||||
* symmetric 40-bit bounds.)
|
||||
*
|
||||
* Below we'll declare and initialize a 32-bit FIR filter with TAP_COUNT random coefficients.
|
||||
* We'll then iterate for TAP_COUNT steps, adding samples to the filter without actually computing
|
||||
* the filter output. Finally, we'll iterate another FILTER_TICKS steps, adding random samples to
|
||||
* the filter and calculating the outputs.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Declare the required filter struct, filter_fir_s32_t.
|
||||
*
|
||||
* If your application requires multiple 32-bit FIR filters, one of these should be allocated
|
||||
* for each such filter.
|
||||
*/
|
||||
filter_fir_s32_t filter;
|
||||
|
||||
/**
|
||||
* Allocate buffers for the filter coefficients and the filter state.
|
||||
*
|
||||
* To initialize the filter, the init function needs to be provided with the filter coefficients.
|
||||
* In practice, if filter coefficients are known in advance, these can be declared as read-only
|
||||
* memory. They will not be modified by the filter API.
|
||||
*
|
||||
* The filter API uses a circular buffer to mangage the filter's state for the user. To do this,
|
||||
* it requires a user-supplied buffer in which to place the historical sample data.
|
||||
*
|
||||
* Both coefficient and sample buffers must be larger enough to contain TAP_COUNT int32_t
|
||||
* elements.
|
||||
*
|
||||
* Additionally, both buffers must begin at a 4-byte- (word-) aligned address. This is the default
|
||||
* behavior for int32_t arrays, but use of the WORD_ALIGNED macro is shown here for completeness.
|
||||
* This alignment should be manually specified if the buffer was originally declared as a char
|
||||
* array, for example.
|
||||
*/
|
||||
int32_t WORD_ALIGNED coefficients[TAP_COUNT] = {0};
|
||||
int32_t WORD_ALIGNED sample_buffer[TAP_COUNT] = {0};
|
||||
|
||||
/**
|
||||
* How to choose the final shift factor for a filter is beyond the scope of this example, but
|
||||
* here a final scale of 2 bits seems to work correctly. Note that this shift is unsigned
|
||||
* (i.e. negative values are treated as large right-shifts; they will not left-shift).
|
||||
*/
|
||||
right_shift_t filter_shr = 2;
|
||||
|
||||
// Choose some random coefficients
|
||||
for(int lag = 0; lag < TAP_COUNT; lag++){
|
||||
coefficients[lag] = rand() << 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize filter with filter_fir_s32_init().
|
||||
*
|
||||
* The parameters here are straight-forward, given the description of filter behavior
|
||||
* above.
|
||||
*/
|
||||
filter_fir_s32_init(&filter, sample_buffer, TAP_COUNT, coefficients, filter_shr);
|
||||
|
||||
// Print out the filter coefficients
|
||||
printf("Filter Coefficients:\n");
|
||||
|
||||
printf("b = [");
|
||||
for(int k = 0; k < TAP_COUNT; k++){
|
||||
printf("% 11ld, ", (long int) filter.coef[k]);
|
||||
if(k % 8 == 7) printf("\n ");
|
||||
}
|
||||
printf("]\n");
|
||||
|
||||
printf("filter_shr = %d\n\n", filter.shift);
|
||||
|
||||
|
||||
// Print the equivalent floating-point FIR filter coefficients
|
||||
printf("b_float = [");
|
||||
for(int k = 0; k < TAP_COUNT; k++){
|
||||
printf("%f, ", ldexp(filter.coef[k], -30-filter.shift));
|
||||
if(k % 6 == 5) printf("\n ");
|
||||
}
|
||||
printf("]\n\n");
|
||||
|
||||
/**
|
||||
* Using filter_s32_add_sample(), supply the filter with random input samples without actually
|
||||
* computing any filtered output.
|
||||
*
|
||||
* This may be useful, for example, to quickly fill up a filters state with inputs, without
|
||||
* incurring the cost to compute the filtered output for each sample.
|
||||
*/
|
||||
for(int k = -TAP_COUNT; k < 0; k++) {
|
||||
int32_t input_sample = rand() << 1;
|
||||
filter_fir_s32_add_sample(&filter, input_sample);
|
||||
|
||||
printf("input[% 4d] = % 12ld; output[% 4d] = (N/A)\n", k, (long int) input_sample, k);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
/**
|
||||
* Using filter_fir_s32(), supply the filter with new random input samples, computing the filtered
|
||||
* output value each time.
|
||||
*/
|
||||
for(int k = 0; k < FILTER_TICKS; k++){
|
||||
int32_t input_sample = rand() << 1;
|
||||
int32_t output_sample = filter_fir_s32(&filter, input_sample);
|
||||
|
||||
printf("input[% 4d] = % 12ld; output[% 4d] = % 12ld\n", k, (long int) input_sample, k, (long int) output_sample);
|
||||
}
|
||||
|
||||
}
|
||||
31
lib_xcore_math/examples/app_filter_demo/src/main.c
Normal file
31
lib_xcore_math/examples/app_filter_demo/src/main.c
Normal file
@@ -0,0 +1,31 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __XS3A__
|
||||
# include <xscope.h>
|
||||
#endif
|
||||
|
||||
void filter_32bit_fir();
|
||||
void filter_16bit_fir();
|
||||
void filter_32bit_biquad();
|
||||
|
||||
#define RAND_SEED 76465367
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
#ifdef __XS3A__
|
||||
xscope_config_io(XSCOPE_IO_BASIC);
|
||||
#endif
|
||||
|
||||
// Seed the random number generator, using a constant for reproducibility
|
||||
srand(RAND_SEED);
|
||||
|
||||
// filter_32bit_fir();
|
||||
// filter_16bit_fir();
|
||||
filter_32bit_biquad();
|
||||
|
||||
return 0;
|
||||
}
|
||||
41
lib_xcore_math/examples/app_vect_demo/CMakeLists.txt
Normal file
41
lib_xcore_math/examples/app_vect_demo/CMakeLists.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
|
||||
project(app_vect_demo)
|
||||
|
||||
if(DEFINED XCORE_TARGET)
|
||||
set(APP_HW_TARGET ${XCORE_TARGET})
|
||||
else()
|
||||
set(APP_HW_TARGET XK-EVK-XU316)
|
||||
endif()
|
||||
|
||||
set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
|
||||
|
||||
set(APP_COMPILER_FLAGS -Os)
|
||||
|
||||
if(NOT BUILD_NATIVE)
|
||||
list(APPEND APP_COMPILER_FLAGS -fxscope
|
||||
-report
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
|
||||
list(APPEND APP_COMPILER_FLAGS -Werror
|
||||
-g
|
||||
)
|
||||
else()
|
||||
list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
|
||||
# Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
|
||||
# See online help for details.
|
||||
-D_CRT_SECURE_NO_WARNINGS
|
||||
# Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
|
||||
/wd5045
|
||||
)
|
||||
endif()
|
||||
|
||||
if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
|
||||
list(APPEND APP_COMPILER_FLAGS -lm)
|
||||
endif()
|
||||
|
||||
XMOS_REGISTER_APP()
|
||||
29
lib_xcore_math/examples/app_vect_demo/src/main.c
Normal file
29
lib_xcore_math/examples/app_vect_demo/src/main.c
Normal file
@@ -0,0 +1,29 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __XS3A__
|
||||
# include <xscope.h>
|
||||
#endif
|
||||
|
||||
void vect_s32_example();
|
||||
void vect_complex_s16_example();
|
||||
|
||||
#define RAND_SEED 0xDEFACED
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
#ifdef __XS3A__
|
||||
xscope_config_io(XSCOPE_IO_BASIC);
|
||||
#endif
|
||||
|
||||
// Seed the random number generator, using a constant for reproducibility
|
||||
srand(RAND_SEED);
|
||||
|
||||
vect_s32_example();
|
||||
vect_complex_s16_example();
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,228 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
|
||||
// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also prints headroom.
|
||||
static void print_vector_complex_s16(
|
||||
const int16_t real[],
|
||||
const int16_t imag[],
|
||||
const char* name,
|
||||
const exponent_t exp,
|
||||
const unsigned length,
|
||||
const unsigned line)
|
||||
{
|
||||
// First, the raw mantissas
|
||||
printf("%s = [", name);
|
||||
for(unsigned int k = 0; k < length; k++)
|
||||
printf("%d + j*%d, ", real[k], imag[k]);
|
||||
printf("] * 2**(%d)\t// (line %u)\n", exp, line);
|
||||
|
||||
// Next, the float equivalent
|
||||
printf("%s_float = [", name);
|
||||
for(unsigned int k = 0; k < length; k++)
|
||||
printf("%0.07f + j*%0.07f, ", ldexp(real[k], exp), ldexp(imag[k], exp));
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
#define PRINT_VECT_C16(X) print_vector_complex_s16(X.real, X.imag, #X, X.exp, LENGTH, __LINE__)
|
||||
|
||||
// The length (in elements) of our BFP vectors. We're using a small length to keep the text output relatively tidy.
|
||||
#define LENGTH (4)
|
||||
|
||||
|
||||
|
||||
void vect_complex_s16_example()
|
||||
{
|
||||
printf("\n\n\n\n");
|
||||
printf("================================\n");
|
||||
printf("== vect_complex_s16_example() ==\n");
|
||||
printf("================================\n");
|
||||
|
||||
/*
|
||||
Here we'll look at the functin vect_complex_s16_add().
|
||||
|
||||
vect_complex_s16_add() is an element-wise addition of two complex 16-bit vectors.
|
||||
|
||||
headroom_t vect_complex_s16_add(
|
||||
int16_t a_real[],
|
||||
int16_t a_imag[],
|
||||
const int16_t b_real[],
|
||||
const int16_t b_imag[],
|
||||
const int16_t c_real[],
|
||||
const int16_t c_imag[],
|
||||
const unsigned length,
|
||||
const right_shift_t b_shr,
|
||||
const right_shift_t c_shr);
|
||||
|
||||
The first six parameters are the real and imaginary parts of each of the input and output
|
||||
vectors. Specifically, if A[] is (conceptually) the complex block floating-point output
|
||||
vector, then a_real[] and a_imag[] store the real and imaginary parts respectively of A[]'s
|
||||
mantissas. Likewise with complex input vectors B[] and C[]. So, the mantissa of an element
|
||||
B[k] is (b_real[k] + j*b_imag[k]), and if the exponent of B[] is B_exp, then the rational
|
||||
value of B[k] is (b_real[k] + j*b_imag[k]) * 2^(B_exp).
|
||||
|
||||
Each of the first six parameters must be word-aligned and point to a buffer that can hold
|
||||
at least `length` elements.
|
||||
|
||||
b_shr and c_shr are arithmetic right-shifts applied to the mantissas of B[] and C[]
|
||||
respectively prior to being added. These shifts are necessary not only to prevent
|
||||
overflow/saturation, but also because in order for adding two mantissas together to be
|
||||
a meaningful operation, they must correspond to the same exponent. For example, if we
|
||||
two rational values (1 * 10^0) and (2 * 10^-1), each representing the number 1.0,
|
||||
if we simply add together that mantissas (1+2=3) we get a result in which no integer
|
||||
power of 10 gives a correct solution.
|
||||
|
||||
b_shr and c_shr, as well as the exponent associated with the output vector A[] can be
|
||||
calculated using vect_complex_s16()'s prepare function, vect_complex_s16_prepare().
|
||||
|
||||
void vect_complex_s16_prepare(
|
||||
exponent_t* a_exp,
|
||||
right_shift_t* b_shr,
|
||||
right_shift_t* c_shr,
|
||||
const exponent_t b_exp,
|
||||
const exponent_t c_exp,
|
||||
const headroom_t b_hr,
|
||||
const headroom_t c_hr);
|
||||
|
||||
ASIDE: Technically vect_complex_s16_add_prepare is a macro which resolves to
|
||||
vect_s32_add_prepare. The logic for computing the shifts and output exponent for
|
||||
vect_complex_s16_add() and vect_s32_add() is the same. The macro is used to
|
||||
provide a more uniform API for the user (i.e. the prepare function for operation 'X()'
|
||||
is 'X_prepare()').
|
||||
|
||||
Here, a_exp is an output parameter which is the exponent associated with the ouput vector
|
||||
A[]. b_shr and c_shr are output parameters which correspond to the b_shr and c_shr used
|
||||
for vect_complex_s16_add(). b_exp and b_hr are the exponent and vector headroom of
|
||||
input vector B[]. Likewise with c_exp, c_hr and C[].
|
||||
|
||||
Finally, the output of vect_complex_s16_add() is the headroom of the complex output
|
||||
vector A[]. For a 16-bit complex BFP vector, the headroom is the minimum of the headroom
|
||||
of each of the vector's complex mantissas. The headroom of a complex mantissa is the
|
||||
minimum of the headroom of its real part and its imaginary part.
|
||||
|
||||
*/
|
||||
|
||||
// This is the data we'll use for vectors A[], B[] and C[]. Note that the real[] and imag[]
|
||||
// arrays are declared as word-aligned. This is a requirement for most vect_* functions.
|
||||
// Also note that this data structure looks very similar to bfp_complex_s16_t, which the
|
||||
// high-level BFP API uses for complex 16-bit vectors.
|
||||
struct {
|
||||
int16_t WORD_ALIGNED real[LENGTH];
|
||||
int16_t WORD_ALIGNED imag[LENGTH];
|
||||
exponent_t exp;
|
||||
headroom_t hr;
|
||||
} A, B, C;
|
||||
// To be clear about the notation used (in the comments) here, A[], B[] and C[] refer to the
|
||||
// block floating-point vectors, and e.g. A[k] refers to the complex rational value
|
||||
// ((A.real[k] + j*A.imag[k]) * 2^(A.exp)).
|
||||
|
||||
|
||||
// Setting our initial exponents to -15 means that the vector elements will span the range
|
||||
// [-1.0, 1.0), because INT16_MIN is -2^15 and INT16_MAX is (2^15)-1.
|
||||
B.exp = -15;
|
||||
C.exp = -15;
|
||||
|
||||
// Randomly set the mantissas for B[] and C[]
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
B.real[k] = rand();
|
||||
B.imag[k] = rand();
|
||||
C.real[k] = rand();
|
||||
C.imag[k] = rand();
|
||||
}
|
||||
|
||||
|
||||
// We haven't yet initialized the headroom of vectors B[] and C[]. Because we need to know their
|
||||
// headroom to properly call vect_complex_s16_add_prepare(), we need to initialize these.
|
||||
// vect_complex_s16_headroom() computes the headroom for a complex 16-bit mantissa vector.
|
||||
{
|
||||
printf("\n\n\n============ Step 1 ============\n");
|
||||
|
||||
B.hr = vect_complex_s16_headroom(B.real, B.imag, LENGTH);
|
||||
C.hr = vect_complex_s16_headroom(C.real, C.imag, LENGTH);
|
||||
|
||||
PRINT_VECT_C16(B);
|
||||
PRINT_VECT_C16(C);
|
||||
|
||||
printf("B.hr --> %u\n", B.hr);
|
||||
printf("C.hr --> %u\n", C.hr);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Now that we've initialized everything, we can execute the add operation.
|
||||
{
|
||||
printf("\n\n\n============ Step 2 ============\n");
|
||||
|
||||
// First, we prepare, which tells us the output exponent A.exp and the shift values
|
||||
// needed by vect_complex_s16_add()
|
||||
right_shift_t b_shr, c_shr;
|
||||
vect_complex_s16_add_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
|
||||
|
||||
printf("A.exp --> %d\n", A.exp);
|
||||
printf("b_shr --> %d\n", b_shr);
|
||||
printf("c_shr --> %d\n", c_shr);
|
||||
|
||||
// Then we call the operator, making sure that we capture the headroom of A[], returned by
|
||||
// the function.
|
||||
A.hr = vect_complex_s16_add(A.real, A.imag, B.real, B.imag,
|
||||
C.real, C.imag, LENGTH, b_shr, c_shr);
|
||||
|
||||
PRINT_VECT_C16(A);
|
||||
printf("A.hr --> %u\n", A.hr);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Perhaps we need out complex 16-bit output vector A[] to be in the particular fixed-point
|
||||
// Q-format Q8.8, in which the mantissa's 8 least significant bits are fractional. We can
|
||||
// achieve this adjusting the results of the prepare step.
|
||||
{
|
||||
printf("\n\n\n============ Step 3 ============\n");
|
||||
|
||||
// First, prepare as we did before.
|
||||
right_shift_t b_shr, c_shr;
|
||||
vect_complex_s16_add_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
|
||||
|
||||
// The exponent have been chosen to maximize the precision of the result vector
|
||||
printf("A.exp --> %d (before adjustment)\n", A.exp);
|
||||
printf("b_shr --> %d (before adjustment)\n", b_shr);
|
||||
printf("c_shr --> %d (before adjustment)\n", c_shr);
|
||||
|
||||
// To force an output Q-format of Q8.8, we find the adjustment necessary to change the
|
||||
// output exponent to the negative of the number of fractional bits:
|
||||
const exponent_t required_exp = -8;
|
||||
const int adjustment = required_exp - A.exp;
|
||||
|
||||
// This adjustment is used to modify the shift parameters and (for correctness) the output
|
||||
// exponent.
|
||||
|
||||
// ASIDE: If you've looked at the vect_s32_example() where we enforced an output Q-format of
|
||||
// Q8.24 from vect_s32_mul(), you may notice that the logic here looks different.
|
||||
// In particular, in that example, the adjustment was split between b_shr and c_shr,
|
||||
// whereas here b_shr and c_shr each receive the full adjustment. This is a genuine
|
||||
// difference in the logic of addition versus multiplication. To make these sorts of
|
||||
// manual optimizations, *you must understand how the mantissas and exponents of the
|
||||
// inputs and outputs of that particular operation relate to one another*.
|
||||
b_shr += adjustment;
|
||||
c_shr += adjustment;
|
||||
A.exp += adjustment;
|
||||
|
||||
printf("A.exp --> %d (after adjustment)\n", A.exp);
|
||||
printf("b_shr --> %d (after adjustment)\n", b_shr);
|
||||
printf("c_shr --> %d (after adjustment)\n", c_shr);
|
||||
|
||||
// The the adjustments complete, we call vect_complex_s16_add() as usual.
|
||||
A.hr = vect_complex_s16_add(A.real, A.imag, B.real, B.imag,
|
||||
C.real, C.imag, LENGTH, b_shr, c_shr);
|
||||
|
||||
PRINT_VECT_C16(A);
|
||||
printf("A.hr --> %u\n", A.hr);
|
||||
}
|
||||
|
||||
}
|
||||
299
lib_xcore_math/examples/app_vect_demo/src/vect_s32_example.c
Normal file
299
lib_xcore_math/examples/app_vect_demo/src/vect_s32_example.c
Normal file
@@ -0,0 +1,299 @@
|
||||
// Copyright 2020-2024 XMOS LIMITED.
|
||||
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "xcore_math.h"
|
||||
|
||||
|
||||
// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also
|
||||
// prints headroom.
|
||||
static void print_vector_s32(
|
||||
const int32_t vect[],
|
||||
const char* name,
|
||||
const exponent_t exp,
|
||||
const unsigned length,
|
||||
const unsigned line)
|
||||
{
|
||||
// First, the raw mantissas
|
||||
printf("%s = [", name);
|
||||
for(unsigned int k = 0; k < length; k++)
|
||||
printf("%ld, ", (long int) vect[k]);
|
||||
printf("] * 2**(%d)\t// (line %u)\n", exp, line);
|
||||
|
||||
// Next, the float equivalent
|
||||
printf("%s_float = [", name);
|
||||
for(unsigned int k = 0; k < length; k++)
|
||||
printf("%0.07f, ", ldexp(vect[k], exp));
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
#define PRINT_VECT_S32(X) print_vector_s32(X.data, #X, X.exp, LENGTH, __LINE__)
|
||||
|
||||
// The length (in elements) of our BFP vectors. We're using a small length to keep the text output
|
||||
// relatively tidy.
|
||||
#define LENGTH (4)
|
||||
|
||||
|
||||
|
||||
void vect_s32_example()
|
||||
{
|
||||
|
||||
printf("\n\n\n\n");
|
||||
printf("========================\n");
|
||||
printf("== vect_s32_example() ==\n");
|
||||
printf("========================\n");
|
||||
|
||||
/*
|
||||
Here we'll look at the function vect_s32_mul()
|
||||
|
||||
vect_s32_mul() is an element-wise real multiply for vectors of 32-bit elements. Its prototype
|
||||
is:
|
||||
|
||||
headroom_t vect_s32_mul(
|
||||
int32_t a[],
|
||||
const int32_t b[],
|
||||
const int32_t c[],
|
||||
const unsigned length,
|
||||
const right_shift_t b_shr,
|
||||
const right_shift_t c_shr);
|
||||
|
||||
The first three parameters are the input and output vectors. More specifically, b[] and c[] are
|
||||
input vectors and a[] is an output. All three vectors should be at least length elements long.
|
||||
The vectors must also begin at word-aligned addresses (this happens automaticaly when operating
|
||||
on an int32_t array, but may need to be explicitly specified if, for example, we're operating on
|
||||
a char array which is being recast as an int32_t array).
|
||||
|
||||
b_shr and c_shr are arithmetic right-shifts applied to elements of b[] and c[] (respectively)
|
||||
prior to multiplication. These are the levers used to simultaneously prevent
|
||||
overflows/saturation in the result and to minimize the headroom of the result (which has the
|
||||
effect of maximizing the precision of the result).
|
||||
|
||||
What values to use for b_shr and c_shr? The vect_* functions which take shift parameters as
|
||||
arguments each have an associated helper function whose name is the same as that of the operator
|
||||
function with "_prepare" at the end. So, for vect_s32_mul() the corresponding prepare function
|
||||
is vect_s32_mul_prepare().
|
||||
|
||||
void vect_s32_mul_prepare(
|
||||
exponent_t* a_exp,
|
||||
right_shift_t* b_shr,
|
||||
right_shift_t* c_shr,
|
||||
const exponent_t b_exp,
|
||||
const exponent_t c_exp,
|
||||
const headroom_t b_hr,
|
||||
const headroom_t c_hr);
|
||||
|
||||
Most of the prepare functions look similar to this. Here a_exp, b_shr and c_shr are the
|
||||
parameters to be output by this function. b_shr and c_shr are the shift parameters to be passed
|
||||
on to vect_s32_mul() and a_exp is the exponent that should be associated with the product vector
|
||||
a[]. The inputs b_exp and c_exp are the exponents associated with input vectors b[] and c[]
|
||||
respectively.
|
||||
|
||||
Exponents come into play because the prepare functions assume that a block floating-point-like
|
||||
behavior is desired. In general, the product of two 32-bit integers will not fit into a 32-bit
|
||||
value, and so the output exponent a_exp is calculated to ensure the product does not hit the
|
||||
data type's saturation bounds while preserving the most possible precision.
|
||||
|
||||
b_hr and c_hr are the headroom of vectors b[] and c[]. The headroom of a vector is the number of
|
||||
bits that its elements may be left-shifted without losing information (i.e. saturating).
|
||||
Equivalently, for vectors with signed elements, it is the minimum number of _redundant_ leading
|
||||
sign bits among the individual elements. This is important because it tells us about the range
|
||||
of values actually held in a vector of integers.
|
||||
|
||||
Returning to vect_s32_mul(), note that its return value is also of type headroom_t. The XS3 VPU
|
||||
has hardware features that allow it to track the headroom of data being processed. The returned
|
||||
value is the headroom of the output vector a[] (which may be needed for subsequent processing
|
||||
steps).
|
||||
|
||||
*/
|
||||
|
||||
// Let's start by creating 3 structs representing BFP vectors
|
||||
struct {
|
||||
int32_t WORD_ALIGNED data[LENGTH];
|
||||
exponent_t exp;
|
||||
headroom_t hr;
|
||||
} A, B, C;
|
||||
|
||||
// Note that this struct is similar to the bfp_s32_t struct, except
|
||||
// 1) it contains no length because here we're using a static length of LENGTH
|
||||
// 2) instead of a pointer to a mantissa buffer, the buffer is part of the struct.
|
||||
|
||||
// Also notice that WORD_ALIGNED was used to guarantee word-alignment of data[] (even though it
|
||||
// should not actually be necessary here because we're using int32_t).
|
||||
|
||||
|
||||
// Let's fill B[] and C[] with random values
|
||||
{
|
||||
printf("\n\n\n============ Step 1 ============\n");
|
||||
|
||||
// Random mantissas
|
||||
for(int k = 0; k < LENGTH; k++){
|
||||
B.data[k] = rand();
|
||||
C.data[k] = rand();
|
||||
}
|
||||
|
||||
//use an exponent of -31 on each, just to keep the values tractable
|
||||
B.exp = -31;
|
||||
C.exp = -31;
|
||||
|
||||
PRINT_VECT_S32(B);
|
||||
PRINT_VECT_S32(C);
|
||||
|
||||
}
|
||||
|
||||
// We don't yet know the headroom of B[] and C[], so we'll find that next
|
||||
{
|
||||
printf("\n\n\n============ Step 2 ============\n");
|
||||
|
||||
B.hr = vect_s32_headroom(B.data, LENGTH);
|
||||
C.hr = vect_s32_headroom(C.data, LENGTH);
|
||||
printf("B.hr --> %u\n", B.hr);
|
||||
printf("C.hr --> %u\n", C.hr);
|
||||
}
|
||||
|
||||
// Now we can multiply the vectors
|
||||
{
|
||||
printf("\n\n\n============ Step 3 ============\n");
|
||||
|
||||
right_shift_t b_shr;
|
||||
right_shift_t c_shr;
|
||||
vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
|
||||
|
||||
printf("A.exp --> %d (output exponent)\n", A.exp);
|
||||
printf("b_shr --> %d\n", b_shr);
|
||||
printf("c_shr --> %d\n", c_shr);
|
||||
|
||||
A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
|
||||
|
||||
PRINT_VECT_S32(A);
|
||||
printf("A.hr --> %u\n", A.hr);
|
||||
}
|
||||
|
||||
|
||||
// The code above is exactly how bfp_s32_mul() implements this behavior.
|
||||
|
||||
/*
|
||||
What if this multiplication is the last step in our BFP logic, and immediately afterwards the
|
||||
result is to be input to a function which requires the data to be in a specific Q-format? In
|
||||
this case, if we used the logic above (or bfp_s32_mul()) we may then have to add an additional
|
||||
step to shift the mantissas around to achieve the correct exponent. Let's suppose the data must
|
||||
be in a Q24 format:
|
||||
|
||||
... //continued from above
|
||||
const exponent_t required_exponent = -24;
|
||||
right_shift_t q_shr = required_exponent - A.exp;
|
||||
vect_s32_shr(A.data, A.data, LENGTH, q_shr);
|
||||
A.hr += q_shr;
|
||||
A.exp += q_shr;
|
||||
|
||||
Alternatively, you can save CPU cycles and skip that step entirely by modifying the shift
|
||||
parameters b_shr and c_shr.
|
||||
|
||||
*/
|
||||
{
|
||||
printf("\n\n\n============ Step 4 ============\n");
|
||||
|
||||
right_shift_t b_shr;
|
||||
right_shift_t c_shr;
|
||||
vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
|
||||
|
||||
// This is the required exponent
|
||||
const exponent_t required_exponent = -24;
|
||||
// This is the number of bits the mantissas must be right-shifted to achieve the required
|
||||
// exponents
|
||||
const int adjustment = required_exponent - A.exp;
|
||||
|
||||
/*
|
||||
Here it becomes important that you understand the particular operation under consideration.
|
||||
This is element-wise multiplication, such that
|
||||
|
||||
A[k] * 2^(A.exp) = B[k] * 2^(B.exp) * C[k] * 2^(C.exp)
|
||||
= B[k] * C[k] * 2^(B.exp + C.exp)
|
||||
|
||||
The temptation at this point might be to add 'adjustment' to both b_shr and c_shr. But because
|
||||
multiplication of exponentials (with a common base) results in adding together their exponents,
|
||||
in order to achieve the desired output exponent, we instead need to add a _total_ of
|
||||
'adjustment' to the _sum_ b_shr + c_shr. It can all be added to one or to the other, or it can
|
||||
be split between them, which is what we'll do here.
|
||||
*/
|
||||
b_shr += (adjustment / 2);
|
||||
c_shr += (adjustment - (adjustment / 2));
|
||||
// And just for completeness we'll make sure A[]'s exponent is also adjusted
|
||||
A.exp += adjustment;
|
||||
|
||||
// The remaining steps are identical to the pure block floating-point case above.
|
||||
A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
|
||||
|
||||
PRINT_VECT_S32(A);
|
||||
// Note that the printed floating point output values are (nearly) the same as above, indicating
|
||||
// we've done this correctly.
|
||||
printf("A.hr --> %u\n", A.hr);
|
||||
|
||||
// A final note on manually specifying an output Q-format: When doing this, there is no built-in
|
||||
// protection which prevents the user from corrupting the data through saturation or underflows.
|
||||
// That protection can only come from a clear understanding of the assumptions being made about
|
||||
// the input vectors (e.g. the range of possible exponents and mantissa values), and an
|
||||
// understanding of the operator being invoked.
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Finally, in some situations we may have a priori information that the BFP operators are unable
|
||||
to take into account.
|
||||
|
||||
For example, sometimes we may have a vector of unsigned integers to multiply by another 32-bit
|
||||
vector, but the XS3 VPU does not support any unsigned multiplications. Does that mean we cannot
|
||||
do such an operation? Hardly!
|
||||
*/
|
||||
{
|
||||
printf("\n\n\n============ Step 5 ============\n");
|
||||
|
||||
// Let's suppose that we want to multiply B[] by C[], but where C[] is a vector of unsigned
|
||||
// integers.
|
||||
|
||||
// The key fact that allows this to work is that *as long as the unsigned input vector has at
|
||||
// least 1 bit of (unsigned) headroom, it can be safely treated as a signed vector*.
|
||||
|
||||
// ASIDE: "signed" vs "unsigned" headroom. Signed headroom (which is almost always what we
|
||||
// mean by "headroom" in lib_xcore_math) is the number of _redundant_ leading sign
|
||||
// bits of the vector element with the fewest sign bits. The number of redundant
|
||||
// leading sign bits is one fewer than the number of sign bits. "unsigned" headroom,
|
||||
// on the other hand, is the number of leading _zeros_ of the vector element with the
|
||||
// fewest number of leading zeros. Unsigned integers do not have sign bits, of
|
||||
// course. The signed headroom of an unsigned vector is then one less than the
|
||||
// unsigned headroom of that vector. So, this becomes a problem only when the
|
||||
// unsigned vector has 0 bits of unsigned headroom (i.e. _negative_ signed headroom).
|
||||
|
||||
// Ideally the (unsigned) headroom of C[] would already be known or is calculated (avoiding the
|
||||
// need for any adjustments if there's already 1 bit of (unsigned) headroom) but a simple
|
||||
// strategy that usually works is to just manually add 1 bit of headroom to the unsigned vector
|
||||
// as follows
|
||||
for(int k = 0; k < LENGTH; k++)
|
||||
C.data[k] = ((uint32_t)C.data[k]) >> 1;
|
||||
|
||||
C.exp += 1;
|
||||
PRINT_VECT_S32( C );
|
||||
|
||||
// Now that we've guaranteed one bit of headroom, we can apply the usual logic
|
||||
right_shift_t b_shr;
|
||||
right_shift_t c_shr;
|
||||
vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
|
||||
|
||||
printf("A.exp --> %d\n", A.exp);
|
||||
printf("b_shr --> %d\n", b_shr);
|
||||
printf("c_shr --> %d\n", c_shr);
|
||||
|
||||
A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
|
||||
PRINT_VECT_S32(A);
|
||||
printf("A.hr --> %u\n", A.hr);
|
||||
|
||||
// As a final note on this topic, because the XS3 VPU only uses signed logic, even if it outputs
|
||||
// a vector with all non-negative mantissas a sign bit must necessarily be present, and so this
|
||||
// trick should, in general, never be needed with vectors that result from lib_xcore_math
|
||||
// operators.
|
||||
}
|
||||
|
||||
}
|
||||
1
lib_xcore_math/examples/deps.cmake
Normal file
1
lib_xcore_math/examples/deps.cmake
Normal file
@@ -0,0 +1 @@
|
||||
set(APP_DEPENDENT_MODULES "lib_xcore_math")
|
||||
Reference in New Issue
Block a user