init

2025-12-11 09:43:42 +08:00
commit d8b2974133
1822 changed files with 280037 additions and 0 deletions
--- a/lib_xcore_math/examples/CMakeLists.txt
+++ b/lib_xcore_math/examples/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(lib_xcore_math_examples)
+
+add_subdirectory(app_bfp_demo)
+add_subdirectory(app_fft_demo)
+add_subdirectory(app_filter_demo)
+add_subdirectory(app_vect_demo)
--- a/lib_xcore_math/examples/app_bfp_demo/CMakeLists.txt
+++ b/lib_xcore_math/examples/app_bfp_demo/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(app_bfp_demo)
+
+set(APP_HW_TARGET XK-EVK-XU316)
+
+set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
+
+set(APP_COMPILER_FLAGS -Os)
+
+if(NOT BUILD_NATIVE)
+    list(APPEND APP_COMPILER_FLAGS -fxscope
+                                   -report
+                                   )
+endif()
+
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    list(APPEND APP_COMPILER_FLAGS -Werror
+                                   -g
+                                   )
+else()
+    list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
+                                   # Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
+                                   # See online help for details.
+                                   -D_CRT_SECURE_NO_WARNINGS
+                                   # Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
+                                   /wd5045
+                                   )
+endif()
+
+if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
+    list(APPEND APP_COMPILER_FLAGS -lm)
+endif()
+
+XMOS_REGISTER_APP()
--- a/lib_xcore_math/examples/app_bfp_demo/src/main.c
+++ b/lib_xcore_math/examples/app_bfp_demo/src/main.c
@@ -0,0 +1,247 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __XS3A__
+# include <xscope.h>
+#endif
+
+#include "xcore_math.h"
+
+void bfp_s32_example();
+
+#define RAND_SEED   0xDECAFFED
+
+int main(int argc, char** argv)
+{
+#ifdef __XS3A__
+  xscope_config_io(XSCOPE_IO_BASIC);
+#endif
+
+  // Seed the random number generator, using a constant for reproducibility
+  srand(RAND_SEED);
+
+  // This example app only demonstrates the real 32-bit BFP functions, but the other three main categories of
+  // arithmetic BFP functions, real 16-bit, complex 32-bit and complex 16-bit, all work in a very similar way.
+
+  // The slight caveat is that in a complex 32-bit BFP vector the real and imaginary parts of a given element are
+  // stored in adjacent memory locations, in a complex 16-bit BFP vector the real and imaginary parts are stored in
+  // separate buffers.
+  bfp_s32_example();
+
+  return 0;
+}
+
+
+// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also prints headroom.
+static void print_vector(
+    const bfp_s32_t* vect,
+    const char* name,
+    const unsigned line_num)
+{
+  printf("=== From line #%u ===\n", line_num);
+
+  // First, the raw mantissas
+  printf("%s = [", name);
+  for(unsigned int k = 0; k < vect->length; k++)
+    printf("%ld, ", (long int)vect->data[k]);
+  printf("] * 2**(%d)\n", vect->exp);
+
+  // Next, the float equivalent
+  printf("%s_float = [", name);
+  for(unsigned int k = 0; k < vect->length; k++)
+    printf("%0.07f, ", ldexp(vect->data[k], vect->exp));
+  printf("]\n");
+
+  // And the headroom
+  printf("%s headroom: %u\n\n", name, vect->hr);
+}
+
+#define PRINT_VECT(X)     print_vector(&X, #X, __LINE__)
+#define PRINT_SCALAR(X)   printf("%s = %0.07f\n\n", #X, ldexp((double) X.mant, X.exp))
+
+
+// The length (in elements) of our BFP vectors. We're using a small length to keep the text output relatively tidy.
+#define LENGTH    (4)
+
+
+
+void bfp_s32_example()
+{
+
+  printf("###################################\n");
+  printf("### 32-bit block floating-point ###\n");
+  printf("###################################\n\n");
+
+  // Let's use 3 BFP vectors, A, B and C for this example. For real 32-bit BFP vectors, we use the type bfp_s32_t.
+  bfp_s32_t A, B, C;
+
+  // The bfp_s32_t type does not allocate its own buffer for the mantissa vector (nor does the initialization function).
+  // Instead, it contains a pointer that points to the backing buffer.
+  int32_t buffer_A[LENGTH];
+  int32_t buffer_B[LENGTH];
+  int32_t buffer_C[LENGTH];
+
+  // With the space allocated, we can initialize our BFP vectors.
+  // bfp_s32_init() takes:
+  //    - A pointer to the bfp_s32_t to be initialized
+  //    - The address of the buffer which backs the vector
+  //    - The initial exponent of the vector
+  //    - The length of the vector, in elements
+  //    - a boolean indicating whether or not the headroom should be immediately calculated.
+  printf("Initializing BFP vectors A, B and C\n");
+  bfp_s32_init(&A, buffer_A, -31, LENGTH, 0);
+  bfp_s32_init(&B, buffer_B, -31, LENGTH, 0);
+  bfp_s32_init(&C, buffer_C, -31, LENGTH, 0);
+
+  // Above we specified an initial exponent of -31. For a signed 32-bit value, which can hold integers between
+  // (and including) -(2^31) and (2^31)-1, an exponent of -31 means the representable range is -1.0 to 1.0,
+  // including the former, but not the latter.
+
+  // Above we also instructed bfp_s32_init() NOT to compute the headroom. This is because for B and C we're about
+  // to randomly set the mantissas, invalidating whatever headroom was calculated. A, on the other hand, will first
+  // be used as an output vector, so its exponent, headroom and mantissa data will be clobbered.
+
+  // This is because we're going to fill them with random data next, so their current values are irrelevant.
+
+  // Fill vector's B and C with random data.
+  for(int k = 0; k < LENGTH; k++){
+    B.data[k] = rand();
+    C.data[k] = rand();
+  }
+
+  // Last thing before we print out the initial values of vectors B and C will be to force a recalculation of their
+  // headroom with this random data. A call to bfp_s32_headroom() accomplishes this.
+  printf("\nUpdating headroom of vectors B and C.\n");
+  bfp_s32_headroom(&B);
+  bfp_s32_headroom(&C);
+
+  // Now print out the initial values for B and C.
+  {
+    printf("Initial values for BFP vectors B and C:\n");
+    PRINT_VECT(B);
+    PRINT_VECT(C);
+    printf("\n\n");
+  }
+
+
+  // Let's start by applying a right-shift to the values of B using bfp_s32_shl() (with a negative shift). This isn't
+  // necessary for the calls that follow, we can at least watch what happens to the mantissas, exponent and headroom.
+  {
+    printf("A[k] <-- B[k] >> 10\n");
+    bfp_s32_shl(&A, &B, -10);
+    PRINT_VECT(A);
+    // Applying a -10 bit left-shift effectively divides the vector by a factor of 1024. It's worth pointing out that
+    // because we're doing this in block floating-point, it would have been (nearly) equivalent to simply reduce the
+    // exponent of B by 10.
+    printf("\n\n");
+  }
+
+
+  // Now let's try adding together vectors B and C element-wise.
+  {
+    printf("A[k] <-- B[k] + C[k]\n");
+    bfp_s32_add(&A, &B, &C);
+    PRINT_VECT(A);
+
+    // Note that the above call to bfp_s32_add() takes 3 arguments. The first is the BFP vector into which the result
+    // will be placed, the second and third are the two input vectors. All three bfp_s32_t's must have already been
+    // initialized (otherwise they won't point to a valid data buffer), although the mantissas, exponent and headroom
+    // of the output vector will be clobbered.
+    // Most BFP functions in this library are very similar, with the first (non-const) bfp_s32_t* being the output vector
+    // and the subsequent bfp_s32_t pointers being the input vectors.
+    printf("\n\n");
+  }
+
+
+  // In the previous step, we updated A with the sum of vector's B and C. But, if we have no need to keep the contents
+  // of B around (e.g. it's needed for a subsequent stage), we may be able to forego the memory cost of a third
+  // vector (A) just for the output by updating B in-place.
+  {
+    printf("B[k] <-- B[k] + C[k]\n");
+    bfp_s32_add(&B, &B, &C);
+    PRINT_VECT(B);
+
+    // Except where explicitly stated otherwise, the element-wise BFP functions in this library (which take a
+    // pointer to an output vector) can safely update one or the other of the input vectors in-place.
+    printf("\n\n");
+  }
+
+
+  // Element-wise multiplication works just like addition:
+  {
+    printf("A[k] <-- B[k] * C[k]\n");
+    bfp_s32_mul(&A, &B, &C);
+    PRINT_VECT(A);
+    printf("\n\n");
+  }
+
+
+
+
+  // There are several BFP functions that take only a single input vector:
+
+  // Rectify a vector so that all elements are non-negative
+  {
+    printf("B[k] <-- max{ A[k], 0 }\n");
+    bfp_s32_rect(&B, &A);
+    PRINT_VECT(B);
+    printf("\n\n");
+  }
+
+
+  // Take the square root of each element of a vector
+  {
+    printf("A[k] <-- sqrt(B[k])\n");
+    bfp_s32_sqrt(&A, &B);
+    PRINT_VECT(A);
+    printf("\n\n");
+  }
+
+
+  // There are others that take a vector and a scalar as inputs, like bfp_s32_scale()
+  {
+    float_s32_t alpha = {
+      .mant = 0xBADDA7E5, //mantissa
+      .exp = -23          //exponent
+    };
+    printf("C[k] <-- C[k] * %0.07f\n", ldexp(alpha.mant, alpha.exp));
+    bfp_s32_scale(&C, &C, alpha);
+    PRINT_VECT(C);
+    printf("\n\n");
+  }
+
+
+  // And still others that produce a scalar, rather than a vector
+
+  // Inner product of vectors B and C
+  {
+    printf("alpha <-- sum{ B[k] * C[k] }  (inner product)\n");
+    float_s64_t alpha = bfp_s32_dot(&B, &C);
+    PRINT_SCALAR(alpha);
+    printf("\n\n");
+  }
+
+
+  // Maximum element of a BFP vector
+  {
+    printf("alpha <-- max{ B[0], B[1], ... B[LENGTH-1] }\n");
+    float_s32_t alpha = bfp_s32_max(&B);
+    PRINT_SCALAR(alpha);
+    printf("\n\n");
+  }
+
+
+  // Mean value of a BFP vector
+  {
+    printf("alpha <-- mean{ A[0], A[1], ..., A[LENGTH-1] }\n");
+    float_s32_t alpha = bfp_s32_mean(&A);
+    PRINT_SCALAR(alpha);
+    printf("\n\n");
+  }
+
+
+}
--- a/lib_xcore_math/examples/app_fft_demo/CMakeLists.txt
+++ b/lib_xcore_math/examples/app_fft_demo/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(app_fft_demo)
+
+if(DEFINED XCORE_TARGET)
+    set(APP_HW_TARGET ${XCORE_TARGET})
+else()
+    set(APP_HW_TARGET XK-EVK-XU316)
+endif()
+
+set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
+
+set(APP_COMPILER_FLAGS -Os)
+
+if(NOT BUILD_NATIVE)
+    list(APPEND APP_COMPILER_FLAGS -fxscope
+                                   -report
+                                   )
+endif()
+
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    list(APPEND APP_COMPILER_FLAGS -Werror
+                                   -g
+                                   )
+else()
+    list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
+                                   # Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
+                                   # See online help for details.
+                                   -D_CRT_SECURE_NO_WARNINGS
+                                   # Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
+                                   /wd5045
+                                   )
+endif()
+
+if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
+    list(APPEND APP_COMPILER_FLAGS -lm)
+endif()
+
+XMOS_REGISTER_APP()
--- a/lib_xcore_math/examples/app_fft_demo/src/main.c
+++ b/lib_xcore_math/examples/app_fft_demo/src/main.c
@@ -0,0 +1,453 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __XS3A__
+# include <xscope.h>
+#endif
+
+#include "xcore_math.h"
+
+void fft_mono_example();
+void fft_stereo_example();
+void fft_complex_example();
+
+#define RAND_SEED   3457345734
+
+int main(int argc, char** argv)
+{
+#ifdef __XS3A__
+  xscope_config_io(XSCOPE_IO_BASIC);
+#endif
+
+  // Seed the random number generator, using a constant for reproducibility
+  srand(RAND_SEED);
+
+  fft_mono_example();
+  fft_stereo_example();
+  fft_complex_example();
+
+  return 0;
+}
+
+
+// The length (in elements) of our BFP vectors.
+#define LENGTH    (128)
+
+
+void fft_mono_example()
+{
+
+  printf("\n\n\n");
+  printf("################\n");
+  printf("### Mono FFT ###\n");
+  printf("################\n\n");
+
+  /*
+    This function demonstrates how apply the forward and inverse FFT to a single channel of
+    real data.  This is accomplished using the bfp_fft_forward_mono() and bfp_fft_inverse_mono()
+    functions respectively.
+
+    bfp_s32_t         --bfp_fft_forward_mono()--> bfp_complex_s32_t
+    bfp_complex_s32_t --bfp_fft_inverse_mono()--> bfp_s32_t
+  */
+
+  // bfp_fft_forward_mono() requires an input BFP vector of type bfp_s32_t, whose mantissa vector
+  // is backed by an int32_t array.
+  // The +2 below is so spectrum unpacking can be demonstrated (more on this below).
+  int32_t buffer[LENGTH+2];
+
+  // We'll just fill that vector with random mantissas.
+  for(int k = 0; k < LENGTH; k++){
+    buffer[k] = rand() << 1;
+  }
+
+  // Before doing the forward FFT the input vector needs to be initialized. In many situations there
+  // is no obvious natural exponent to associate with the time-domain signal (such as PCM audio
+  // streams). In such a situation, it is often convenient to select an exponent that normalizes the
+  // data to a known range, such as [-1.0, 1.0). For 32-bit signed data, an exponent of -31 does
+  // that.
+  // Note that the final parameter is instructing the init function to compute the headroom of the
+  // input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
+  // there would be no point to computing the headroom here, as it is invalidated the moment we
+  // modify x.data[].
+  bfp_s32_t x;
+  bfp_s32_init(&x, buffer, -31, LENGTH, 1);
+
+  // Print out the floating point equivalent values of the input vector's elements prior to applying
+  // the FFT.
+  printf("x = [");
+  for(unsigned int k = 0; k < x.length; k++)
+    printf("%0.04f, ", ldexp(x.data[k], x.exp));
+  printf("]\n\n");
+
+  // bfp_fft_forward_mono() operates on data in-place. We'll print out the buffer address before
+  // and after the transformation to convince ourselves of this.
+#ifdef __xcore__
+  printf("&x.data[0] --> 0x%08X\n", (unsigned) &x.data[0]);
+#endif
+  printf("x.length --> %u\n\n", x.length);
+
+
+  // Apply the FFT.
+  // This function takes a pointer to the input (time-domain, usually) BFP vector, and returns
+  // a pointer *to the same address* recast to a complex BFP vector.
+  bfp_complex_s32_t* X = bfp_fft_forward_mono(&x);
+
+  /*
+    There are two important things to be aware of with the mono FFT.
+
+    1) The complex output vector X is only half the length of the input vector.
+    2) The real part of the Nyquist rate component of the FFT is packed into the first element of X.
+
+    The DFT of an N-element signal is periodic with period N. Additionally, the DFT of a real
+    signal has a complex conjugate symmetry about angular frequency 0. Whereas a fully complex
+    DFT requires N complex elements to fully specify the signal, the symmetry property of a
+    real DFT means that only N/2 complex elements are needed.
+
+    A consequence of the periodicity and conjugate symmetry properties taken together is that the
+    components of the DFT at DC and at the Nyquist rate must have 0 imaginary part. Rather than
+    require the user to supply an output vector with space allocated for N/2+1 complex elements,
+    we opt to compute the DFT in-place, and pack the Nyquist rate component into the imaginary
+    part of the DC component.
+
+    If the user wishes, she can allocate space for 2 extra int32_t elements (same as 1 complex_s32_t
+    element) in the time-domain buffer (but *not* identified in the length of the time-domain BFP
+    vector, as that must be a power of 2), and unpack the Nyquist component after the FFT is applied
+    (making sure to modify the frequency-domain BFP vectors length by adding 1). Note that
+    bfp_fft_inverse_mono() however expects the data to be packed as bfp_fft_forward_mono() left it.
+
+    Note also that the inverse mono FFT also assumes the conjugate symmetry of the frequency
+    spectrum, meaning that the inverse transform is guaranteed to yield a purely real signal.
+  */
+
+#define UNPACK_SPECTRUM_MONO    1
+
+  // Unpack the mono spectrum
+  if(UNPACK_SPECTRUM_MONO) {
+    bfp_fft_unpack_mono(X);
+  }
+
+  // Print out the buffer address and vector length of X, the frequency-domain signal
+#ifdef __xcore__
+  printf("&X->data[0] --> 0x%08X\n", (unsigned) &X->data[0]);
+#endif
+  printf("X->length --> %u\n\n", X->length);
+
+  // Print out the complex BFP vector X
+  printf("X = [");
+  for(unsigned int k = 0; k < X->length; k++)
+    printf("(%0.04f + %0.04fj), ", ldexp(X->data[k].re, X->exp), ldexp(X->data[k].im, X->exp) );
+  printf("]\n\n");
+
+  ///////////////////////////////
+  // ...
+  // Here you are free to do any required frequency-domain processing of the signal.
+  // ...
+  ///////////////////////////////
+
+  // Re-pack the mono spectrum
+  if(UNPACK_SPECTRUM_MONO) {
+    bfp_fft_pack_mono(X);
+  }
+
+  // Apply the inverse FFT.
+  // Like the forward mono FFT, this function returns a pointer, this time a bfp_s32_t*, which is
+  // the function's input parameter recast. In this case we can ignore the returned value, as it
+  // would simply point to x, which is already of the correct type and readily available. In other
+  // situations (e.g. when x exists in a different scope) the returned value may be useful.
+  bfp_fft_inverse_mono(X);
+
+  // Finally, print out the inverse transformed signal, which should match the original signal to
+  // within the arithmetic precision of the forward and inverse transform pair.
+  printf("x = [");
+  for(unsigned int k = 0; k < x.length; k++)
+    printf("%0.04f, ", ldexp(x.data[k], x.exp));
+  printf("]\n\n");
+
+
+}
+
+
+void fft_stereo_example()
+{
+
+  printf("\n\n\n");
+  printf("##################\n");
+  printf("### Stereo FFT ###\n");
+  printf("##################\n\n");
+
+  /*
+    This function demonstrates how apply the forward and inverse FFT to a pair of channels, each
+    containing a sequence of real sample data.  This is accomplished using the
+    bfp_fft_forward_stereo() and bfp_fft_inverse_stereo() functions respectively.
+
+    2 x bfp_s32_t     --bfp_fft_forward_stereo()--> 2 x bfp_complex_s32_t
+    2 x bfp_complex_s32_t --bfp_fft_inverse_stereo()--> 2 x bfp_s32_t
+  */
+
+  // bfp_fft_forward_stereo() requires 2 input BFP vectors of type bfp_s32_t, whose  mantissa
+  // vectors are backed by a int32_t arrays. The +1 is because we're going to do some unpacking of
+  // the spectra after applying the FFT. See fft_mono_example() for unpacking explanation.
+  // The DWORD_ALIGNED qualifier instructs the compiler to ensure the arrays start at an
+  // 8-byte-aligned memory offset.
+  int32_t DWORD_ALIGNED bufferA[LENGTH + 1];
+  int32_t DWORD_ALIGNED bufferB[LENGTH + 1];
+
+  // bfp_fft_forward_stereo() and bfp_fft_inverse_stereo() also require a scratch buffer with
+  // element type complex_s32_t which is the same length (in elements) of the input vectors.
+  complex_s32_t DWORD_ALIGNED scratch[LENGTH];
+
+  // Fill in the buffers with random mantissas (left shift is to ensure some are negative).
+  for(int k = 0; k < LENGTH; k++){
+    bufferA[k] = rand() << 1;
+    bufferB[k] = rand() << 1;
+  }
+
+  // Before doing the forward FFT the array needs to be turned into a proper BFP vector. So we
+  // initialize a pair of bfp_s32_t structs.
+  //
+  // In many situations there is no obvious natural exponent to associate with the time-domain
+  // signal (such as PCM audio streams). In such a situation, it is often convenient to select an
+  // exponent that normalizes the data to a known range, such as [-1.0, 1.0). For 32-bit signed
+  // data, an exponent of -31 does that.
+  //
+  // Note that the final parameter is instructing the init function to compute the headroom of the
+  // input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
+  // there would be no point to computing the headroom here, as it is invalidated the moment we
+  // modify a.data[] or b.data[].
+  bfp_s32_t a, b;
+  bfp_s32_init(&a, bufferA, -31, LENGTH, 1);
+  bfp_s32_init(&b, bufferB, -31, LENGTH, 1);
+
+  // Print out the floating point equivalent values of the input vectors' elements prior to applying
+  // the FFT.
+  printf("a = [");
+  for(unsigned int k = 0; k < a.length; k++)
+    printf("%0.04f, ", ldexp(a.data[k], a.exp));
+  printf("]\n\n");
+
+  printf("b = [");
+  for(unsigned int k = 0; k < b.length; k++)
+    printf("%0.04f, ", ldexp(b.data[k], b.exp));
+  printf("]\n\n");
+
+  // Apply the FFT.
+  //
+  // This function takes a pointer to the input BFP vectors and a scratch buffer
+  bfp_fft_forward_stereo(&a, &b, scratch);
+
+  // The two bfp_s32_t vectors containing the time-domain data have now been clobbered with the
+  // frequency-domain data, and should not be directly accessed. Instead, we alias each bfp_s32_t to
+  // a bfp_complex_s32_t and access the results through that.
+  bfp_complex_s32_t* ChA = (bfp_complex_s32_t*) &a;
+  bfp_complex_s32_t* ChB = (bfp_complex_s32_t*) &b;
+
+
+  /*
+    See the note above (in fft_mono_example()) about the properties of the real DFT that allow it to
+    be computed in-place. Much of the same applies for the stereo DFT.
+
+    It is another property of the DFT that the real and imaginary parts of the input domain are
+    transformed in a way which allows two purely real signals to be DFTed simultaneously and their
+    spectra to be fully separated afterwards. This is done by bfp_fft_forward_stereo().
+
+    Like the mono FFT, each output spectrum (channels A and B) is represented by only N/2 complex
+    elements. This again is allowed as a result of the assumption that the input signals are purely
+    real. Also like the mono FFT, the real part of the Nyquist rate is packed into the imaginary
+    part of the DC element for each of the spectra.
+
+    The spectrum for each channel can be unpacked using bfp_fft_unpack_mono(), just as with
+    bfp_fft_forward_mono().
+
+    If much computation is to be done in the frequency domain this unpacking is likely to be
+    relatively inexpensive and to reduce the likelihood of errors (e.g. complex multiplication of
+    the packed spectrum will likely produce undesired effects).
+
+    Before performing the inverse FFT, the signal must be re-packed with bfp_fft_pack_mono().
+
+  */
+
+#define UNPACK_SPECTRA_STEREO    1
+
+  // Unpack the spectra
+  if(UNPACK_SPECTRA_STEREO) {
+    bfp_fft_unpack_mono(ChA);
+    bfp_fft_unpack_mono(ChB);
+  }
+
+
+  // Print out the lengths of the channel A and channel B frequency spectra.
+  printf("ChA.length --> %u\n\n", ChA->length);
+  printf("ChB.length --> %u\n\n", ChB->length);
+
+  // Print out the floating-point equivalent of the channel A and B frequency spectra.
+  printf("ChA = [");
+  for(unsigned int k = 0; k < ChA->length; k++)
+    printf("(%0.04f + %0.04fj), ", ldexp(ChA->data[k].re, ChA->exp), ldexp(ChA->data[k].im, ChA->exp) );
+  printf("]\n\n");
+
+  printf("ChB = [");
+  for(unsigned int k = 0; k < ChB->length; k++)
+    printf("(%0.04f + %0.04fj), ", ldexp(ChB->data[k].re, ChB->exp), ldexp(ChB->data[k].im, ChB->exp) );
+  printf("]\n\n");
+
+  ///////////////////////////////
+  // ...
+  // Here you are free to do any required frequency-domain processing of the signal.
+  // ...
+  ///////////////////////////////
+
+
+  // Repack the spectra
+  if(UNPACK_SPECTRA_STEREO) {
+    bfp_fft_pack_mono(ChA);
+    bfp_fft_pack_mono(ChB);
+  }
+
+  // Apply the inverse FFT.
+  // This function behaves much like the forward stereo FFT, with the input and output parameters
+  // reversed.
+  bfp_fft_inverse_stereo(ChA, ChB, scratch);
+
+  // ChA and ChB should now be considered clobbered, and `a` and `b` can be used to access the
+  // time-domain data.
+
+  // Finally, print out the inverse transformed signal, which should match the original signal to
+  // within the arithmetic precision of the forward and inverse transform pair.
+  printf("a = [");
+  for(unsigned int k = 0; k < a.length; k++)
+    printf("%0.04f, ", ldexp(a.data[k], a.exp));
+  printf("]\n\n");
+
+  printf("b = [");
+  for(unsigned int k = 0; k < b.length; k++)
+    printf("%0.04f, ", ldexp(b.data[k], b.exp));
+  printf("]\n\n");
+
+}
+
+
+void fft_complex_example()
+{
+
+  printf("\n\n\n");
+  printf("###################\n");
+  printf("### Complex FFT ###\n");
+  printf("###################\n\n");
+
+  /*
+    This function demonstrates how apply the forward and inverse FFT to single channel containing
+    containing a sequence of complex sample data.  This is accomplished using the
+    bfp_fft_forward_complex() and bfp_fft_inverse_complex() functions respectively.
+
+    bfp_complex_s32_t --bfp_fft_forward_complex()--> bfp_complex_s32_t
+    bfp_complex_s32_t --bfp_fft_inverse_complex()--> bfp_complex_s32_t
+  */
+
+  // bfp_fft_forward_complex() requires an input BFP vector of type bfp_complex_s32_t, whose
+  // mantissa vector is backed by a complex_s32_t array.
+  // The complex_s32_t struct contains 2 fields, `re` for the real part and `im` for the
+  // imaginary part of an element.  The buffer is then an alternating sequence of real and
+  // imaginary parts for successive elements.
+  complex_s32_t buffer[LENGTH];
+
+  // Fill in the buffer with random mantissas (left shift is to ensure some are negative).
+  for(int k = 0; k < LENGTH; k++){
+    buffer[k].re = rand() << 1;
+    buffer[k].im = rand() << 1;
+  }
+
+  // Before doing the forward FFT the array needs to be turned into a proper BFP vector. So we
+  // initialize a bfp_complex_s32_t with bfp_complex_s32_init().
+  // In many situations there is no obvious natural exponent to associate with the time-domain
+  // signal. In such a situation, it is often convenient to select an exponent that normalizes
+  // the data to a known range, such as [-1.0, 1.0). For 32-bit signed data, an exponent of -31
+  // does that.
+  // Alternatively, initializing with an exponent of 0 has the advantage that, after having
+  // applied the inverse transform, the resulting exponent indicates the overall level of scaling
+  // applied throughout the process.
+  // Note that the final parameter is instructing the init function to compute the headroom of the
+  // input vector. If we instead chose to fill buffer[] with random data *after* initializing x,
+  // there would be no point to computing the headroom here, as it is invalidated the moment we
+  // modify x.data[].
+  bfp_complex_s32_t x;
+  bfp_complex_s32_init(&x, buffer, -31, LENGTH, 1);
+
+  // Print out the floating point equivalent values of the input vector's elements prior to applying
+  // the FFT.
+  printf("x = [");
+  for(unsigned int k = 0; k < x.length; k++)
+    printf("%0.04f + %0.04fj, ", ldexp(x.data[k].re, x.exp), ldexp(x.data[k].im, x.exp));
+  printf("]\n\n");
+
+  // bfp_fft_forward_complex() operates on data in-place. We'll print out the buffer address before
+  // and after the transformation to convince ourselves of this.
+#ifdef __xcore__
+  printf("&x.data[0] --> 0x%08X\n", (unsigned) &x.data[0]);
+#endif
+  printf("x.length --> %u\n\n", x.length);
+
+  // Apply the FFT.
+  // Unlike the mono and stereo FFTs, the input and output vector types for the complex FFT are the
+  // the same, so no additional structs or type casts are required.
+  bfp_fft_forward_complex(&x);
+
+  /*
+    Unlike bfp_fft_forward_mono() and bfp_fft_forward_stereo(), the input signal for
+    bfp_fft_forward_complex() is not purely real. As such, a full FFT_N complex elements are
+    required to represent the spectrum.
+
+    Additionally there is no packing of the Nyquist component's real part into the DC component's
+    imaginary part.
+
+    bfp_fft_forward_complex() can be used to perform a DFT on a single channel of purely real data
+    by simply placing that signal in the real parts of a complex vector, with each element's
+    imaginary part being zero. It's worth noting a few differences between doing this and just using
+    bfp_fft_forward_mono().
+
+    1) The complex input vector requires twice as much memory. For long FFTs, this may be
+       appreciable.
+    2) The cycles taken for bfp_fft_forward_mono() to compute an FFT_N-point DFT is approximately the
+       same as the cycles taken to perform an FFT_N/2-point DFT using bfp_fft_forward_complex().
+    3) The complex FFT does not result in a packed signal.
+  */
+
+  // ... howevr, we'll just create a new pointer to the spectrum to clarify when we're operating
+  // in the time domain when we're in the frequency domain (`X` being frequency domain)
+  bfp_complex_s32_t* X = &x;
+
+  // Print out the address and length of the complex frequency spectrum
+#ifdef __xcore__
+  printf("&X->data[0] --> 0x%08X\n", (unsigned) &X->data[0]);
+#endif
+  printf("X->length --> %u\n\n", X->length);
+
+  // Print out the floating-point equivalent of the complex frequency spectrum
+  printf("X = [");
+  for(unsigned int k = 0; k < X->length; k++)
+    printf("(%0.04f + %0.04fj), ", ldexp(X->data[k].re, X->exp), ldexp(X->data[k].im, X->exp) );
+  printf("]\n\n");
+
+  ///////////////////////////////
+  // ...
+  // Here you are free to do any required frequency-domain processing of the signal.
+  // ...
+  ///////////////////////////////
+
+  // Apply the inverse FFT
+  bfp_fft_inverse_complex(X);
+
+  // Finally, print out the inverse transformed signal, which should match the original signal to
+  // within the arithmetic precision of the forward and inverse transform pair.
+  printf("x = [");
+  for(unsigned int k = 0; k < x.length; k++)
+    printf("%0.04f + %0.04fj, ", ldexp(x.data[k].re, x.exp), ldexp(x.data[k].im, x.exp));
+  printf("]\n\n");
+
+
+}
--- a/lib_xcore_math/examples/app_filter_demo/CMakeLists.txt
+++ b/lib_xcore_math/examples/app_filter_demo/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(app_filter_demo)
+
+if(DEFINED XCORE_TARGET)
+    set(APP_HW_TARGET ${XCORE_TARGET})
+else()
+    set(APP_HW_TARGET XK-EVK-XU316)
+endif()
+
+set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
+
+set(APP_COMPILER_FLAGS -Os)
+
+if(NOT BUILD_NATIVE)
+    list(APPEND APP_COMPILER_FLAGS -fxscope
+                                   -report
+                                   )
+endif()
+
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    list(APPEND APP_COMPILER_FLAGS -Werror
+                                   -g
+                                   )
+else()
+    list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
+                                   # Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
+                                   # See online help for details.
+                                   -D_CRT_SECURE_NO_WARNINGS
+                                   # Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
+                                   /wd5045
+                                   )
+endif()
+
+if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
+    list(APPEND APP_COMPILER_FLAGS -lm)
+endif()
+
+XMOS_REGISTER_APP()
--- a/lib_xcore_math/examples/app_filter_demo/src/filter_16bit_fir.c
+++ b/lib_xcore_math/examples/app_filter_demo/src/filter_16bit_fir.c
@@ -0,0 +1,174 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xcore_math.h"
+
+
+#define TAP_COUNT     (35)
+
+#define FILTER_TICKS  (50)
+
+
+void filter_16bit_fir()
+{
+  printf("\n\n\n\n");
+  printf("==========================\n");
+  printf("== 16-bit FIR Filtering ==\n");
+  printf("==========================\n");
+
+  /**
+   * In this example we'll look at how to use the 16-bit FIR filtering functions.
+   *
+   * The applied filter logic is approximately:
+   *
+   *    output[n] = sum_k{ input[n-k] * b[k] } >> shift
+   *    where
+   *      > output[n] is the filter output at time n
+   *      > input[n-k] is the filter input k time steps before time n
+   *      > b[k] is the user-supplied filter coefficient associated with time lag k
+   *          (i.e. larger k is applied to older samples)
+   *      > shift is a user-specified power-of-two scale factor
+   *      > sum_k{} is a sum over variable k with   0 < k < N_taps, and
+   *      > N_taps is the number of filter taps.
+   *
+   * The multiplication inside the sum is a full 32-bit multiplication.  The maximum value of
+   * any such product is then
+   *   ((-2^15)*(-2^15)) = 2^(30).  These (effectively 31-bit) products are accumulated into a
+   * 32-bit saturating accumulator. Finally, a rounding, saturating right-shift of `shift` bits
+   * is applied to the 32-bit sum to get the final result which is output.  The final shift
+   * saturates to the symmetric 16-bit bounds of INT16_MAX and -INT16_MAX.  (Note that the
+   * 32-bit accumulator saturates to symmetric 32-bit bounds.)
+   *
+   * Below we'll declare and initialize a 16-bit FIR filter with TAP_COUNT random coefficients.
+   * We'll then iterate for TAP_COUNT steps, adding samples to the filter without actually computing
+   * the filter output. Finally, we'll iterate another FILTER_TICKS steps, adding random samples to
+   * the filter and calculating the outputs.
+   */
+
+  /**
+   * Declare the required filter struct, filter_fir_s16_t.
+   *
+   * If your application requires multiple 16-bit FIR filters, one of these should be allocated
+   * for each such filter.
+   */
+  filter_fir_s16_t filter;
+
+  /**
+   * Allocate buffers for the filter coefficients and the filter state.
+   *
+   * To initialize the filter, the init function needs to be provided with the filter coefficients.
+   * In practice, if filter coefficients are known in advance, these can be declared as read-only
+   * memory. They will not be modified by the filter API.
+   *
+   * The filter API uses a circular buffer to mangage the filter's state for the user. To do this,
+   * it requires a user-supplied buffer in which to place the historical sample data.
+   *
+   * Both coefficient and sample buffers must be larger enough to contain TAP_COUNT int16_t
+   * elements.
+   *
+   * Additionally, both buffers must begin at a 4-byte- (word-) aligned address. This alignment
+   * should be manually specified if the buffer was originally declared as a char array, for
+   * example.
+   */
+  int16_t WORD_ALIGNED coefficients[TAP_COUNT] = {0};
+  int16_t WORD_ALIGNED sample_buffer[TAP_COUNT] = {0};
+
+  /**
+   * How to choose the final shift factor for a filter is beyond the scope of this example, but
+   * here a final scale of 15 bits seems to work correctly. Note that this shift is unsigned
+   * (i.e. negative values are treated as large right-shifts; they will not left-shift).
+   */
+  right_shift_t filter_shr = 15;
+
+  /**
+   * Choose some random coefficients
+   *
+   * We have to be more careful here selecting coefficients than we were with the 32-bit FIR filter.
+   * The difference is that the 32-bit FIR filter has a built-in 30-bit right-shift when the product
+   * of the coefficients and samples are computed, which substantially limits how quickly the
+   * accumulators are able to fill up.  With the 16-bit FIR filters, the internal accumulator can
+   * only store 32-bits.
+   *
+   * The largest product of two 16-bit numbers is  (-2^15 * -2^15) = 2^30. The largest supported
+   * accumulator value is approximately 2^31. That means that in the worst case it takes only two
+   * taps to hit saturation with a 16-bit FIR filter.
+   *
+   * Instead, we want the absolute sum of coefficients to be at most 2^16 ( 2^15 * 2^16 = 2^31 ).
+   * So, we'll first pick random coefficients, then we'll normalize them to ensure they don't
+   * exceed that.
+   */
+  if(1){
+    int64_t coef_sum = 0;
+    for(int lag = 0; lag < TAP_COUNT; lag++){
+      coefficients[lag] = (int16_t) rand();
+      coef_sum += abs(coefficients[lag]);
+    }
+
+    double norm = ldexp(1.0, 16) / ldexp((double) coef_sum, 0);
+
+    for(int lag = 0; lag < TAP_COUNT; lag++){
+      coefficients[lag] = (int16_t) (coefficients[lag] * norm);
+    }
+  }
+
+  /**
+   * Initialize filter with filter_fir_s16_init().
+   *
+   * The parameters here are straight-forward, given the description of filter behavior
+   * above.
+   */
+  filter_fir_s16_init(&filter, sample_buffer, TAP_COUNT, coefficients, filter_shr);
+
+  // Print out the filter coefficients
+  printf("Filter Coefficients:\n");
+
+  printf("b = [");
+  for(int k = 0; k < TAP_COUNT; k++){
+    printf("% 7d, ", filter.coef[k]);
+    if(k % 8 == 7) printf("\n     ");
+  }
+  printf("]\n");
+
+  printf("filter_shr = %d\n\n", filter.shift);
+
+
+  // Print the equivalent floating-point FIR filter coefficients
+  printf("b_float = [");
+  for(int k = 0; k < TAP_COUNT; k++){
+    printf("%f, ", ldexp(filter.coef[k], -filter.shift));
+    if(k % 6 == 5) printf("\n           ");
+  }
+  printf("]\n\n");
+
+  /**
+   * Using filter_s16_add_sample(), supply the filter with random input samples without actually
+   * computing any filtered output.
+   *
+   * This may be useful, for example, to quickly fill up a filters state with inputs, without
+   * incurring the cost to compute the filtered output for each sample.
+   */
+  for(int k = -TAP_COUNT; k < 0; k++) {
+    int16_t input_sample = (int16_t) rand();
+    filter_fir_s16_add_sample(&filter, input_sample);
+
+    printf("input[% 4d] = % 7d;  output[% 4d] =      (N/A)\n", k, input_sample, k);
+  }
+  printf("\n");
+
+  /**
+   * Using filter_fir_s16(), supply the filter with new random input samples, computing the filtered
+   * output value each time.
+   */
+  for(int k = 0; k < FILTER_TICKS; k++){
+    int16_t input_sample = (int16_t) rand();
+    int16_t output_sample = filter_fir_s16(&filter, input_sample);
+
+    printf("input[% 4d] = % 7d;  output[% 4d] = % 7d\n", k, input_sample, k, output_sample);
+  }
+
+
+}
--- a/lib_xcore_math/examples/app_filter_demo/src/filter_32bit_biquad.c
+++ b/lib_xcore_math/examples/app_filter_demo/src/filter_32bit_biquad.c
@@ -0,0 +1,95 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xcore_math.h"
+
+// Each filter_biquad_s32_t can store (up to) 8 biquad filter sections
+#define SECTION_COUNT   8
+#define FILTER_TICKS    120
+
+/**
+ * Biquad filter used in this example.
+ *
+ * Unlike the 16- and 32-bit FIR filters, 32-bit biquad filters store their state and coefficients
+ * within the filter struct itself.
+ *
+ */
+filter_biquad_s32_t filter = {
+  // Number of biquad sections in this filter block
+  .biquad_count = SECTION_COUNT,
+
+  // Filter state, initialized to 0
+  .state = {{0}},
+
+  // Filter coefficients
+  .coef = {
+    { 0x160cc0a,  0x28aa355,  0x323dae1,  0x1f12a2a,   0x32fc6d,  0x9a142d6,  0xc88bad1,  0x541f6bf },
+    { 0xb4007f5,  0x4e11ec,   0x362dabe,  0x9bfb667,   0x405747,  0x6d593cd,  0xae295e3,  0xac62779 },
+    { 0xb5a000d,  0xc84d2fc,  0xa20394f,   0xc9bb57,  0x454e970,  0x912f78e,   0xcf4303,  0x900fdb2 },
+    {-0x574fcb8, -0x6c18c97, -0x2228d4a, -0x6ab3e49,  -0xe9532b, -0x2876ab2, -0x2d4d1e7, -0x11aaecb },
+    {-0x4329901, -0x19826ba, -0x7b349e9, -0x7b0ec2e, -0x96e1eb2,  -0xbe6675, -0x92fa8be, -0x9cf08a7 }
+  }
+};
+
+void filter_32bit_biquad()
+{
+  printf("\n\n\n\n");
+  printf("=============================\n");
+  printf("== 32-bit Biquad Filtering ==\n");
+  printf("=============================\n");
+
+  /**
+   * In this example, we'll look at how to use the 32-bit biquad filtering functions.
+   *
+   * 32-bit biquad filters use the filter_biquad_s32_t type. Unlike the 16- and 32-bit FIR
+   * filters, the 32-bit biquad filter type carries its own data and need not be initialized.
+   * (see above)
+   *
+   * The coefficients of the 32-bit biquad filter are encoded as Q2.30 fixed-point values. If x[n]
+   * is the input to a biquad section at time n, and y[n] is its output, then the relationship
+   * between y[n] and x[n] for a single biquad filter section is:
+   *
+   *    y[n] = ((b0 * x[n  ]) >> 30)
+   *         + ((b1 * x[n-1]) >> 30)
+   *         + ((b2 * x[n-2]) >> 30)
+   *         - ((a1 * y[n-1]) >> 30)
+   *         - ((a2 * y[n-2]) >> 30)
+   *
+   *  where the right-shifts are rounding, and b0, b1, b2, -a1, and -a2 are `filter.coef[0][k]`,
+   * `filter.coef[1][k]`, `filter.coef[2][k]`, `filter.coef[3][k]`, and `filter.coef[4][k]`
+   *  respectively, for the kth biquad section. The output y[n] is then also the input to the
+   *  (k+1)th biquad section.
+   *
+   * A single filter_biquad_s32_t stores coefficients and state information for up to 8 biquad
+   * sections (the optimal block size for computation). If more than 8 biquad sections are required,
+   * an array of filter_biquad_s32_t structs can be used, and the filter can be executed using
+   * filter_biquads_s32() instead of filter_biquad_s32().
+   */
+
+  // Print out the coefficients for each biquad filter section
+  for(unsigned int section = 0; section < filter.biquad_count; section++){
+    printf("Biquad Section[%d]\n", section);
+    printf("  -a1 = %f\n", ldexp(filter.coef[3][section], -30));
+    printf("  -a2 = %f\n", ldexp(filter.coef[4][section], -30));
+    printf("   b0 = %f\n", ldexp(filter.coef[0][section], -30));
+    printf("   b1 = %f\n", ldexp(filter.coef[1][section], -30));
+    printf("   b2 = %f\n", ldexp(filter.coef[2][section], -30));
+  }
+
+
+  /**
+   * Process random samples.
+   * Because we're only using 1 filter block here, we use filter_biquad_s32().
+   */
+  for(int k = 0; k < FILTER_TICKS; k++){
+    int32_t input_sample = rand() << 1;
+    int32_t output_sample = filter_biquad_s32(&filter, input_sample);
+
+    printf("input[% 4d] = % 13ld;  output[% 4d] = % 13ld\n", k, (long int) input_sample, k, (long int) output_sample);
+  }
+
+}
--- a/lib_xcore_math/examples/app_filter_demo/src/filter_32bit_fir.c
+++ b/lib_xcore_math/examples/app_filter_demo/src/filter_32bit_fir.c
@@ -0,0 +1,152 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xcore_math.h"
+
+
+#define TAP_COUNT     (35)
+
+#define FILTER_TICKS  (50)
+
+
+void filter_32bit_fir()
+{
+  printf("\n\n\n\n");
+  printf("==========================\n");
+  printf("== 32-bit FIR Filtering ==\n");
+  printf("==========================\n");
+
+  /**
+   * In this example we'll look at how to use the 32-bit FIR filtering functions.
+   *
+   * The applied filter logic is approximately:
+   *
+   *    output[n] = sum_k{ (input[n-k] * b[k]) >> 30 } >> shift
+   *    where
+   *      > output[n] is the filter output at time n
+   *      > input[n-k] is the filter input k time steps before time n
+   *      > b[k] is the user-supplied filter coefficient associated with time lag k
+   *          (i.e. larger k is applied to older samples)
+   *      > shift is a user-specified power-of-two scale factor
+   *      > sum_k{} is a sum over variable k with   0 < k < N_taps, and
+   *      > N_taps is the number of filter taps.
+   *
+   * The multiplication inside the sum is a full 64-bit multiplication with a built-in 30-bit
+   * rounding right-shift (as shown).  The maximum value of any such product is then
+   *   ((-2^31)*(-2^31)*(2^(-30))) = 2^(62-30) = 2^(32).  These (effectively 33-bit) products
+   * are accumulated into a set of 8 internal 40-bit saturating accumulators, accumulator p
+   * containing the partial sum of the filter indices p mod 8.  The 8 accumulators containing the
+   * 40-bit partial sums are then added together for an (at-most) 43-bit final sum. Finally, a
+   * rounding, saturating right-shift of `shift` bits is applied to the 43-bit sum to get the
+   * final result which is output.  The final shift saturates to the symmetric 32-bit bounds
+   * of INT32_MAX and -INT32_MAX.  (Note that each of the 40-bit accumulators saturates to
+   * symmetric 40-bit bounds.)
+   *
+   * Below we'll declare and initialize a 32-bit FIR filter with TAP_COUNT random coefficients.
+   * We'll then iterate for TAP_COUNT steps, adding samples to the filter without actually computing
+   * the filter output. Finally, we'll iterate another FILTER_TICKS steps, adding random samples to
+   * the filter and calculating the outputs.
+   */
+
+
+  /**
+   * Declare the required filter struct, filter_fir_s32_t.
+   *
+   * If your application requires multiple 32-bit FIR filters, one of these should be allocated
+   * for each such filter.
+   */
+  filter_fir_s32_t filter;
+
+  /**
+   * Allocate buffers for the filter coefficients and the filter state.
+   *
+   * To initialize the filter, the init function needs to be provided with the filter coefficients.
+   * In practice, if filter coefficients are known in advance, these can be declared as read-only
+   * memory. They will not be modified by the filter API.
+   *
+   * The filter API uses a circular buffer to mangage the filter's state for the user. To do this,
+   * it requires a user-supplied buffer in which to place the historical sample data.
+   *
+   * Both coefficient and sample buffers must be larger enough to contain TAP_COUNT int32_t
+   * elements.
+   *
+   * Additionally, both buffers must begin at a 4-byte- (word-) aligned address. This is the default
+   * behavior for int32_t arrays, but use of the WORD_ALIGNED macro is shown here for completeness.
+   * This alignment should be manually specified if the buffer was originally declared as a char
+   * array, for example.
+   */
+  int32_t WORD_ALIGNED coefficients[TAP_COUNT] = {0};
+  int32_t WORD_ALIGNED sample_buffer[TAP_COUNT] = {0};
+
+  /**
+   * How to choose the final shift factor for a filter is beyond the scope of this example, but
+   * here a final scale of 2 bits seems to work correctly. Note that this shift is unsigned
+   * (i.e. negative values are treated as large right-shifts; they will not left-shift).
+   */
+  right_shift_t filter_shr = 2;
+
+  // Choose some random coefficients
+  for(int lag = 0; lag < TAP_COUNT; lag++){
+    coefficients[lag] = rand() << 1;
+  }
+
+  /**
+   * Initialize filter with filter_fir_s32_init().
+   *
+   * The parameters here are straight-forward, given the description of filter behavior
+   * above.
+   */
+  filter_fir_s32_init(&filter, sample_buffer, TAP_COUNT, coefficients, filter_shr);
+
+  // Print out the filter coefficients
+  printf("Filter Coefficients:\n");
+
+  printf("b = [");
+  for(int k = 0; k < TAP_COUNT; k++){
+    printf("% 11ld, ", (long int) filter.coef[k]);
+    if(k % 8 == 7) printf("\n     ");
+  }
+  printf("]\n");
+
+  printf("filter_shr = %d\n\n", filter.shift);
+
+
+  // Print the equivalent floating-point FIR filter coefficients
+  printf("b_float = [");
+  for(int k = 0; k < TAP_COUNT; k++){
+    printf("%f, ", ldexp(filter.coef[k], -30-filter.shift));
+    if(k % 6 == 5) printf("\n           ");
+  }
+  printf("]\n\n");
+
+  /**
+   * Using filter_s32_add_sample(), supply the filter with random input samples without actually
+   * computing any filtered output.
+   *
+   * This may be useful, for example, to quickly fill up a filters state with inputs, without
+   * incurring the cost to compute the filtered output for each sample.
+   */
+  for(int k = -TAP_COUNT; k < 0; k++) {
+    int32_t input_sample = rand() << 1;
+    filter_fir_s32_add_sample(&filter, input_sample);
+
+    printf("input[% 4d] = % 12ld;  output[% 4d] =      (N/A)\n", k, (long int) input_sample, k);
+  }
+  printf("\n");
+
+  /**
+   * Using filter_fir_s32(), supply the filter with new random input samples, computing the filtered
+   * output value each time.
+   */
+  for(int k = 0; k < FILTER_TICKS; k++){
+    int32_t input_sample = rand() << 1;
+    int32_t output_sample = filter_fir_s32(&filter, input_sample);
+
+    printf("input[% 4d] = % 12ld;  output[% 4d] = % 12ld\n", k, (long int) input_sample, k, (long int) output_sample);
+  }
+
+}
--- a/lib_xcore_math/examples/app_filter_demo/src/main.c
+++ b/lib_xcore_math/examples/app_filter_demo/src/main.c
@@ -0,0 +1,31 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __XS3A__
+# include <xscope.h>
+#endif
+
+void filter_32bit_fir();
+void filter_16bit_fir();
+void filter_32bit_biquad();
+
+#define RAND_SEED   76465367
+
+int main(int argc, char** argv)
+{
+#ifdef __XS3A__
+  xscope_config_io(XSCOPE_IO_BASIC);
+#endif
+
+  // Seed the random number generator, using a constant for reproducibility
+  srand(RAND_SEED);
+
+  // filter_32bit_fir();
+  // filter_16bit_fir();
+  filter_32bit_biquad();
+
+  return 0;
+}
--- a/lib_xcore_math/examples/app_vect_demo/CMakeLists.txt
+++ b/lib_xcore_math/examples/app_vect_demo/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(app_vect_demo)
+
+if(DEFINED XCORE_TARGET)
+    set(APP_HW_TARGET ${XCORE_TARGET})
+else()
+    set(APP_HW_TARGET XK-EVK-XU316)
+endif()
+
+set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
+
+include(${CMAKE_CURRENT_LIST_DIR}/../deps.cmake)
+
+set(APP_COMPILER_FLAGS -Os)
+
+if(NOT BUILD_NATIVE)
+    list(APPEND APP_COMPILER_FLAGS -fxscope
+                                   -report
+                                   )
+endif()
+
+if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    list(APPEND APP_COMPILER_FLAGS -Werror
+                                   -g
+                                   )
+else()
+    list(APPEND APP_COMPILER_FLAGS # Suppress warning C4996: 'sprintf': This function or variable may be unsafe.
+                                   # Consider using sprintf_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS.
+                                   # See online help for details.
+                                   -D_CRT_SECURE_NO_WARNINGS
+                                   # Suppress warning C5045: Compiler will insert Spectre mitigation for memory load if /wd5045 switch specified
+                                   /wd5045
+                                   )
+endif()
+
+if(PLATFORM_ID STREQUAL Linux OR PLATFORM_ID STREQUAL Darwin)
+    list(APPEND APP_COMPILER_FLAGS -lm)
+endif()
+
+XMOS_REGISTER_APP()
--- a/lib_xcore_math/examples/app_vect_demo/src/main.c
+++ b/lib_xcore_math/examples/app_vect_demo/src/main.c
@@ -0,0 +1,29 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef __XS3A__
+# include <xscope.h>
+#endif
+
+void vect_s32_example();
+void vect_complex_s16_example();
+
+#define RAND_SEED   0xDEFACED
+
+int main(int argc, char** argv)
+{
+#ifdef __XS3A__
+  xscope_config_io(XSCOPE_IO_BASIC);
+#endif
+
+  // Seed the random number generator, using a constant for reproducibility
+  srand(RAND_SEED);
+
+  vect_s32_example();
+  vect_complex_s16_example();
+
+  return 0;
+}
--- a/lib_xcore_math/examples/app_vect_demo/src/vect_complex_s16_example.c
+++ b/lib_xcore_math/examples/app_vect_demo/src/vect_complex_s16_example.c
@@ -0,0 +1,228 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xcore_math.h"
+
+
+// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also prints headroom.
+static void print_vector_complex_s16(
+    const int16_t real[],
+    const int16_t imag[],
+    const char* name,
+    const exponent_t exp,
+    const unsigned length,
+    const unsigned line)
+{
+  // First, the raw mantissas
+  printf("%s = [", name);
+  for(unsigned int k = 0; k < length; k++)
+    printf("%d + j*%d, ", real[k], imag[k]);
+  printf("] * 2**(%d)\t// (line %u)\n", exp, line);
+
+  // Next, the float equivalent
+  printf("%s_float = [", name);
+  for(unsigned int k = 0; k < length; k++)
+    printf("%0.07f + j*%0.07f, ", ldexp(real[k], exp), ldexp(imag[k], exp));
+  printf("]\n");
+}
+
+#define PRINT_VECT_C16(X)     print_vector_complex_s16(X.real, X.imag, #X, X.exp, LENGTH, __LINE__)
+
+// The length (in elements) of our BFP vectors. We're using a small length to keep the text output relatively tidy.
+#define LENGTH    (4)
+
+
+
+void vect_complex_s16_example()
+{
+  printf("\n\n\n\n");
+  printf("================================\n");
+  printf("== vect_complex_s16_example() ==\n");
+  printf("================================\n");
+
+  /*
+    Here we'll look at the functin vect_complex_s16_add().
+
+    vect_complex_s16_add() is an element-wise addition of two complex 16-bit vectors.
+
+      headroom_t vect_complex_s16_add(
+          int16_t a_real[],
+          int16_t a_imag[],
+          const int16_t b_real[],
+          const int16_t b_imag[],
+          const int16_t c_real[],
+          const int16_t c_imag[],
+          const unsigned length,
+          const right_shift_t b_shr,
+          const right_shift_t c_shr);
+
+    The first six parameters are the real and imaginary parts of each of the input and output
+    vectors. Specifically, if A[] is (conceptually) the complex block floating-point output
+    vector, then a_real[] and a_imag[] store the real and imaginary parts respectively of A[]'s
+    mantissas. Likewise with complex input vectors B[] and C[]. So, the mantissa of an element
+    B[k] is (b_real[k] + j*b_imag[k]), and if the exponent of B[] is B_exp, then the rational
+    value of B[k] is  (b_real[k] + j*b_imag[k]) * 2^(B_exp).
+
+    Each of the first six parameters must be word-aligned and point to a buffer that can hold
+    at least `length` elements.
+
+    b_shr and c_shr are arithmetic right-shifts applied to the mantissas of B[] and C[]
+    respectively prior to being added. These shifts are necessary not only to prevent
+    overflow/saturation, but also because in order for adding two mantissas together to be
+    a meaningful operation, they must correspond to the same exponent. For example, if we
+    two rational values (1 * 10^0) and (2 * 10^-1), each representing the number 1.0,
+    if we simply add together that mantissas (1+2=3) we get a result in which no integer
+    power of 10 gives a correct solution.
+
+    b_shr and c_shr, as well as the exponent associated with the output vector A[] can be
+    calculated using vect_complex_s16()'s prepare function, vect_complex_s16_prepare().
+
+      void vect_complex_s16_prepare(
+          exponent_t* a_exp,
+          right_shift_t* b_shr,
+          right_shift_t* c_shr,
+          const exponent_t b_exp,
+          const exponent_t c_exp,
+          const headroom_t b_hr,
+          const headroom_t c_hr);
+
+    ASIDE: Technically vect_complex_s16_add_prepare is a macro which resolves to
+           vect_s32_add_prepare. The logic for computing the shifts and output exponent for
+           vect_complex_s16_add() and vect_s32_add() is the same. The macro is used to
+           provide a more uniform API for the user (i.e. the prepare function for operation 'X()'
+           is 'X_prepare()').
+
+    Here, a_exp is an output parameter which is the exponent associated with the ouput vector
+    A[]. b_shr and c_shr are output parameters which correspond to the b_shr and c_shr used
+    for vect_complex_s16_add(). b_exp and b_hr are the exponent and vector headroom of
+    input vector B[]. Likewise with c_exp, c_hr and C[].
+
+    Finally, the output of vect_complex_s16_add() is the headroom of the complex output
+    vector A[]. For a 16-bit complex BFP vector, the headroom is the minimum of the headroom
+    of each of the vector's complex mantissas. The headroom of a complex mantissa is the
+    minimum of the headroom of its real part and its imaginary part.
+
+  */
+
+  // This is the data we'll use for vectors A[], B[] and C[]. Note that the real[] and imag[]
+  // arrays are declared as word-aligned. This is a requirement for most vect_* functions.
+  // Also note that this data structure looks very similar to bfp_complex_s16_t, which the
+  // high-level BFP API uses for complex 16-bit vectors.
+  struct {
+    int16_t WORD_ALIGNED real[LENGTH];
+    int16_t WORD_ALIGNED imag[LENGTH];
+    exponent_t exp;
+    headroom_t hr;
+  } A, B, C;
+  // To be clear about the notation used (in the comments) here,  A[], B[] and C[] refer to the
+  // block floating-point vectors, and e.g. A[k] refers to the complex rational value
+  // ((A.real[k] + j*A.imag[k]) * 2^(A.exp)).
+
+
+  // Setting our initial exponents to -15 means that the vector elements will span the range
+  // [-1.0, 1.0), because INT16_MIN is -2^15 and INT16_MAX is (2^15)-1.
+  B.exp = -15;
+  C.exp = -15;
+
+  // Randomly set the mantissas for B[] and C[]
+  for(int k = 0; k < LENGTH; k++){
+    B.real[k] = rand();
+    B.imag[k] = rand();
+    C.real[k] = rand();
+    C.imag[k] = rand();
+  }
+
+
+  // We haven't yet initialized the headroom of vectors B[] and C[]. Because we need to know their
+  // headroom to properly call vect_complex_s16_add_prepare(), we need to initialize these.
+  // vect_complex_s16_headroom() computes the headroom for a complex 16-bit mantissa vector.
+  {
+    printf("\n\n\n============ Step 1 ============\n");
+
+    B.hr = vect_complex_s16_headroom(B.real, B.imag, LENGTH);
+    C.hr = vect_complex_s16_headroom(C.real, C.imag, LENGTH);
+
+    PRINT_VECT_C16(B);
+    PRINT_VECT_C16(C);
+
+    printf("B.hr --> %u\n", B.hr);
+    printf("C.hr --> %u\n", C.hr);
+
+  }
+
+
+  // Now that we've initialized everything, we can execute the add operation.
+  {
+    printf("\n\n\n============ Step 2 ============\n");
+
+    // First, we prepare, which tells us the output exponent A.exp and the shift values
+    // needed by vect_complex_s16_add()
+    right_shift_t b_shr, c_shr;
+    vect_complex_s16_add_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
+
+    printf("A.exp --> %d\n", A.exp);
+    printf("b_shr --> %d\n", b_shr);
+    printf("c_shr --> %d\n", c_shr);
+
+    // Then we call the operator, making sure that we capture the headroom of A[], returned by
+    // the function.
+    A.hr = vect_complex_s16_add(A.real, A.imag, B.real, B.imag,
+                                C.real, C.imag, LENGTH, b_shr, c_shr);
+
+    PRINT_VECT_C16(A);
+    printf("A.hr --> %u\n", A.hr);
+
+  }
+
+
+  // Perhaps we need out complex 16-bit output vector A[] to be in the particular fixed-point
+  // Q-format Q8.8, in which the mantissa's 8 least significant bits are fractional. We can
+  // achieve this adjusting the results of the prepare step.
+  {
+    printf("\n\n\n============ Step 3 ============\n");
+
+    // First, prepare as we did before.
+    right_shift_t b_shr, c_shr;
+    vect_complex_s16_add_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
+
+    // The exponent have been chosen to maximize the precision of the result vector
+    printf("A.exp --> %d (before adjustment)\n", A.exp);
+    printf("b_shr --> %d (before adjustment)\n", b_shr);
+    printf("c_shr --> %d (before adjustment)\n", c_shr);
+
+    // To force an output Q-format of Q8.8, we find the adjustment necessary to change the
+    // output exponent to the negative of the number of fractional bits:
+    const exponent_t required_exp = -8;
+    const int adjustment = required_exp - A.exp;
+
+    // This adjustment is used to modify the shift parameters and (for correctness) the output
+    // exponent.
+
+    //  ASIDE: If you've looked at the vect_s32_example() where we enforced an output Q-format of
+    //         Q8.24 from vect_s32_mul(), you may notice that the logic here looks different.
+    //         In particular, in that example, the adjustment was split between b_shr and c_shr,
+    //         whereas here b_shr and c_shr each receive the full adjustment. This is a genuine
+    //         difference in the logic of addition versus multiplication.  To make these sorts of
+    //         manual optimizations, *you must understand how the mantissas and exponents of the
+    //         inputs and outputs of that particular operation relate to one another*.
+    b_shr += adjustment;
+    c_shr += adjustment;
+    A.exp += adjustment;
+
+    printf("A.exp --> %d (after adjustment)\n", A.exp);
+    printf("b_shr --> %d (after adjustment)\n", b_shr);
+    printf("c_shr --> %d (after adjustment)\n", c_shr);
+
+    // The the adjustments complete, we call vect_complex_s16_add() as usual.
+    A.hr = vect_complex_s16_add(A.real, A.imag, B.real, B.imag,
+                                    C.real, C.imag, LENGTH, b_shr, c_shr);
+
+    PRINT_VECT_C16(A);
+    printf("A.hr --> %u\n", A.hr);
+  }
+
+}
--- a/lib_xcore_math/examples/app_vect_demo/src/vect_s32_example.c
+++ b/lib_xcore_math/examples/app_vect_demo/src/vect_s32_example.c
@@ -0,0 +1,299 @@
+// Copyright 2020-2024 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "xcore_math.h"
+
+
+// Prints a BFP vector both in its mantissa-exponent form and as a floating point vector. Also
+// prints headroom.
+static void print_vector_s32(
+    const int32_t vect[],
+    const char* name,
+    const exponent_t exp,
+    const unsigned length,
+    const unsigned line)
+{
+  // First, the raw mantissas
+  printf("%s = [", name);
+  for(unsigned int k = 0; k < length; k++)
+    printf("%ld, ", (long int) vect[k]);
+  printf("] * 2**(%d)\t// (line %u)\n", exp, line);
+
+  // Next, the float equivalent
+  printf("%s_float = [", name);
+  for(unsigned int k = 0; k < length; k++)
+    printf("%0.07f, ", ldexp(vect[k], exp));
+  printf("]\n");
+}
+
+#define PRINT_VECT_S32(X)     print_vector_s32(X.data, #X, X.exp, LENGTH, __LINE__)
+
+// The length (in elements) of our BFP vectors. We're using a small length to keep the text output
+// relatively tidy.
+#define LENGTH    (4)
+
+
+
+void vect_s32_example()
+{
+
+  printf("\n\n\n\n");
+  printf("========================\n");
+  printf("== vect_s32_example() ==\n");
+  printf("========================\n");
+
+  /*
+    Here we'll look at the function vect_s32_mul()
+
+    vect_s32_mul() is an element-wise real multiply for vectors of 32-bit elements. Its prototype
+    is:
+
+      headroom_t vect_s32_mul(
+        int32_t a[],
+        const int32_t b[],
+        const int32_t c[],
+        const unsigned length,
+        const right_shift_t b_shr,
+        const right_shift_t c_shr);
+
+    The first three parameters are the input and output vectors. More specifically, b[] and c[] are
+    input vectors and a[] is an output. All three vectors should be at least length elements long.
+    The vectors must also begin at word-aligned addresses (this happens automaticaly when operating
+    on an int32_t array, but may need to be explicitly specified if, for example, we're operating on
+    a char array which is being recast as an int32_t array).
+
+    b_shr and c_shr are arithmetic right-shifts applied to elements of b[] and c[] (respectively)
+    prior to multiplication. These are the levers used to simultaneously prevent
+    overflows/saturation in the result and to minimize the headroom of the result (which has the
+    effect of maximizing the precision of the result).
+
+    What values to use for b_shr and c_shr? The vect_* functions which take shift parameters as
+    arguments each have an associated helper function whose name is the same as that of the operator
+    function with "_prepare" at the end. So, for vect_s32_mul() the corresponding prepare function
+    is vect_s32_mul_prepare().
+
+      void vect_s32_mul_prepare(
+        exponent_t* a_exp,
+        right_shift_t* b_shr,
+        right_shift_t* c_shr,
+        const exponent_t b_exp,
+        const exponent_t c_exp,
+        const headroom_t b_hr,
+        const headroom_t c_hr);
+
+    Most of the prepare functions look similar to this. Here a_exp, b_shr and c_shr are the
+    parameters to be output by this function. b_shr and c_shr are the shift parameters to be passed
+    on to vect_s32_mul() and a_exp is the exponent that should be associated with the product vector
+    a[]. The inputs b_exp and c_exp are the exponents associated with input vectors b[] and c[]
+    respectively.
+
+    Exponents come into play because the prepare functions assume that a block floating-point-like
+    behavior is desired. In general, the product of two 32-bit integers will not fit into a 32-bit
+    value, and so the output exponent a_exp is calculated to ensure the product does not hit the
+    data type's saturation bounds while preserving the most possible precision.
+
+    b_hr and c_hr are the headroom of vectors b[] and c[]. The headroom of a vector is the number of
+    bits that its elements may be left-shifted without losing information (i.e. saturating).
+    Equivalently, for vectors with signed elements, it is the minimum number of _redundant_ leading
+    sign bits among the individual elements. This is important because it tells us about the range
+    of values actually held in a vector of integers.
+
+    Returning to vect_s32_mul(), note that its return value is also of type headroom_t. The XS3 VPU
+    has hardware features that allow it to track the headroom of data being processed. The returned
+    value is the headroom of the output vector a[] (which may be needed for subsequent processing
+    steps).
+
+  */
+
+  // Let's start by creating 3 structs representing BFP vectors
+  struct {
+    int32_t WORD_ALIGNED data[LENGTH];
+    exponent_t exp;
+    headroom_t hr;
+  } A, B, C;
+
+  // Note that this struct is similar to the bfp_s32_t struct, except
+  //    1) it contains no length because here we're using a static length of LENGTH
+  //    2) instead of a pointer to a mantissa buffer, the buffer is part of the struct.
+
+  // Also notice that WORD_ALIGNED was used to guarantee word-alignment of data[] (even though it
+  // should not actually be necessary here because we're using int32_t).
+
+
+  // Let's fill B[] and C[] with random values
+  {
+    printf("\n\n\n============ Step 1 ============\n");
+
+    // Random mantissas
+    for(int k = 0; k < LENGTH; k++){
+      B.data[k] = rand();
+      C.data[k] = rand();
+    }
+
+    //use an exponent of -31 on each, just to keep the values tractable
+    B.exp = -31;
+    C.exp = -31;
+
+    PRINT_VECT_S32(B);
+    PRINT_VECT_S32(C);
+
+  }
+
+  // We don't yet know the headroom of B[] and C[], so we'll find that next
+  {
+    printf("\n\n\n============ Step 2 ============\n");
+
+    B.hr = vect_s32_headroom(B.data, LENGTH);
+    C.hr = vect_s32_headroom(C.data, LENGTH);
+    printf("B.hr --> %u\n", B.hr);
+    printf("C.hr --> %u\n", C.hr);
+  }
+
+  // Now we can multiply the vectors
+  {
+    printf("\n\n\n============ Step 3 ============\n");
+
+    right_shift_t b_shr;
+    right_shift_t c_shr;
+    vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
+
+    printf("A.exp --> %d  (output exponent)\n", A.exp);
+    printf("b_shr --> %d\n", b_shr);
+    printf("c_shr --> %d\n", c_shr);
+
+    A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
+
+    PRINT_VECT_S32(A);
+    printf("A.hr --> %u\n", A.hr);
+  }
+
+
+  // The code above is exactly how bfp_s32_mul() implements this behavior.
+
+  /*
+    What if this multiplication is the last step in our BFP logic, and immediately afterwards the
+    result is to be input to a function which requires the data to be in a specific Q-format? In
+    this case, if we used the logic above (or bfp_s32_mul()) we may then have to add an additional
+    step to shift the mantissas around to achieve the correct exponent. Let's suppose the data must
+    be in a Q24 format:
+
+      ... //continued from above
+      const exponent_t required_exponent = -24;
+      right_shift_t q_shr = required_exponent - A.exp;
+      vect_s32_shr(A.data, A.data, LENGTH, q_shr);
+      A.hr += q_shr;
+      A.exp += q_shr;
+
+    Alternatively, you can save CPU cycles and skip that step entirely by modifying the shift
+    parameters b_shr and c_shr.
+
+  */
+  {
+    printf("\n\n\n============ Step 4 ============\n");
+
+    right_shift_t b_shr;
+    right_shift_t c_shr;
+    vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
+
+    // This is the required exponent
+    const exponent_t required_exponent = -24;
+    // This is the number of bits the mantissas must be right-shifted to achieve the required
+    // exponents
+    const int adjustment = required_exponent - A.exp;
+
+    /*
+     Here it becomes important that you understand the particular operation under consideration.
+     This is element-wise multiplication, such that
+
+       A[k] * 2^(A.exp) = B[k] * 2^(B.exp) * C[k] * 2^(C.exp)
+                        = B[k] * C[k] * 2^(B.exp + C.exp)
+
+     The temptation at this point might be to add 'adjustment' to both b_shr and c_shr. But because
+     multiplication of exponentials (with a common base) results in adding together their exponents,
+     in order to achieve the desired output exponent, we instead need to add a _total_ of
+     'adjustment' to the _sum_ b_shr + c_shr. It can all be added to one or to the other, or it can
+     be split between them, which is what we'll do here.
+    */
+    b_shr += (adjustment / 2);
+    c_shr += (adjustment - (adjustment / 2));
+    // And just for completeness we'll make sure A[]'s exponent is also adjusted
+    A.exp += adjustment;
+
+    // The remaining steps are identical to the pure block floating-point case above.
+    A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
+
+    PRINT_VECT_S32(A);
+    // Note that the printed floating point output values are (nearly) the same as above, indicating
+    // we've done this correctly.
+    printf("A.hr --> %u\n", A.hr);
+
+    // A final note on manually specifying an output Q-format: When doing this, there is no built-in
+    // protection which prevents the user from corrupting the data through saturation or underflows.
+    // That protection can only come from a clear understanding of the assumptions being made about
+    // the input vectors (e.g. the range of possible exponents and mantissa values), and an
+    // understanding of the operator being invoked.
+  }
+
+
+
+  /*
+    Finally, in some situations we may have a priori information that the BFP operators are unable
+    to take into account.
+
+    For example, sometimes we may have a vector of unsigned integers to multiply by another 32-bit
+    vector, but the XS3 VPU does not support any unsigned multiplications. Does that mean we cannot
+    do such an operation? Hardly!
+  */
+  {
+    printf("\n\n\n============ Step 5 ============\n");
+
+    // Let's suppose that we want to multiply B[] by C[], but where C[] is a vector of unsigned
+    // integers.
+
+    // The key fact that allows this to work is that *as long as the unsigned input vector has at
+    // least 1 bit of (unsigned) headroom, it can be safely treated as a signed vector*.
+
+        // ASIDE: "signed" vs "unsigned" headroom. Signed headroom (which is almost always what we
+        //        mean by "headroom" in lib_xcore_math) is the number of _redundant_ leading sign
+        //        bits of the vector element with the fewest sign bits. The number of redundant
+        //        leading sign bits is one fewer than the number of sign bits. "unsigned" headroom,
+        //        on the other hand, is the number of leading _zeros_ of the vector element with the
+        //        fewest number of leading zeros. Unsigned integers do not have sign bits, of
+        //        course. The signed headroom of an unsigned vector is then one less than the
+        //        unsigned headroom of that vector. So, this becomes a problem only when the
+        //        unsigned vector has 0 bits of unsigned headroom (i.e. _negative_ signed headroom).
+
+    // Ideally the (unsigned) headroom of C[] would already be known or is calculated (avoiding the
+    // need for any adjustments if there's already 1 bit of (unsigned) headroom) but a simple
+    // strategy that usually works is to just manually add 1 bit of headroom to the unsigned vector
+    // as follows
+    for(int k = 0; k < LENGTH; k++)
+      C.data[k] = ((uint32_t)C.data[k]) >> 1;
+
+    C.exp += 1;
+    PRINT_VECT_S32( C );
+
+    // Now that we've guaranteed one bit of headroom, we can apply the usual logic
+    right_shift_t b_shr;
+    right_shift_t c_shr;
+    vect_s32_mul_prepare(&A.exp, &b_shr, &c_shr, B.exp, C.exp, B.hr, C.hr);
+
+    printf("A.exp --> %d\n", A.exp);
+    printf("b_shr --> %d\n", b_shr);
+    printf("c_shr --> %d\n", c_shr);
+
+    A.hr = vect_s32_mul(A.data, B.data, C.data, LENGTH, b_shr, c_shr);
+    PRINT_VECT_S32(A);
+    printf("A.hr --> %u\n", A.hr);
+
+    // As a final note on this topic, because the XS3 VPU only uses signed logic, even if it outputs
+    // a vector with all non-negative mantissas a sign bit must necessarily be present, and so this
+    // trick should, in general, never be needed with vectors that result from lib_xcore_math
+    // operators.
+  }
+
+}
--- a/lib_xcore_math/examples/deps.cmake
+++ b/lib_xcore_math/examples/deps.cmake
@@ -0,0 +1 @@
+set(APP_DEPENDENT_MODULES "lib_xcore_math")
				`@@ -0,0 +1 @@`
				`set(APP_DEPENDENT_MODULES "lib_xcore_math")`