init

2025-12-11 09:43:42 +08:00
commit d8b2974133
1822 changed files with 280037 additions and 0 deletions
--- a/lib_audio_dsp/test/fd_block_fir/CMakeLists.txt
+++ b/lib_audio_dsp/test/fd_block_fir/CMakeLists.txt
@@ -0,0 +1,37 @@
+cmake_minimum_required(VERSION 3.21)
+include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
+project(fd_fir_test)
+# Configuration variables for the application; they are set before XMOS_REGISTER_APP() is called at the end.
+set(APP_HW_TARGET XK-EVK-XU316)
+set(APP_DEPENDENT_MODULES
+    "lib_audio_dsp"
+    "lib_logging(3.2.0)"
+    "lib_locks(2.2.0)"
+)
+set(APP_PCA_ENABLE OFF)
+set(EXAMPLE_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} -fcomment-asm
+                                             -Wall
+                                             -O3
+                                             -report
+                                             -lquadflash
+                                             -mcmodel=large
+                                             -g
+                                             -fxscope)
+
+set(APP_COMPILER_FLAGS ${EXAMPLE_BUILD_FLAGS})
+# Collect every C file under src/, with paths stored relative to this directory.
+file(GLOB C_SRC CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_LIST_DIR} src/*.c)
+# NOTE(review): DSP_DIR is not referenced again and DSP_MAIN is never set in this file; confirm both are leftovers.
+set(DSP_DIR build/dsp_pipeline)
+set(APP_C_SRCS
+    "${C_SRC};${DSP_MAIN}")
+
+# Header search paths; the last entry is an absolute path inside this directory's build tree.
+set(APP_INCLUDES
+    src
+    src/core
+    src/extensions
+    ${CMAKE_CURRENT_LIST_DIR}/build/dsp_pipeline)
+set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
+# Register the application; presumably the macro comes from xcommon.cmake and reads the variables set above.
+XMOS_REGISTER_APP()
--- a/lib_audio_dsp/test/fd_block_fir/readme.md
+++ b/lib_audio_dsp/test/fd_block_fir/readme.md
@@ -0,0 +1,5 @@
+app_td_block_fir
+---
+
+This demonstrates 16 concurrent FIRs running on a single tile, each of length 4008 and processed with a frame size of 8.
+Currently, it runs at 46kHz.
--- a/lib_audio_dsp/test/fd_block_fir/ref_fir.py
+++ b/lib_audio_dsp/test/fd_block_fir/ref_fir.py
@@ -0,0 +1,104 @@
+# Copyright 2024-2025 XMOS LIMITED.
+# This Software is subject to the terms of the XMOS Public Licence: Version 1.
+"""Common code for time and frequency domain block FIR generator."""
+
+import numpy as np
+import io
+import os
+
+# emit the debug filter coefs
+def emit_debug_filter(fh: io.TextIOWrapper, coefs: np.ndarray, name: str):
+    """
+    Emit a debug section describing the filter to the header.
+
+    Parameters
+    ----------
+    fh : io.TextIOWrapper
+        File handle of the header to write to.
+    coefs : np.ndarray
+        Array of floats describing the filter.
+    name : str
+        Name of the filter.
+
+    Returns
+    -------
+    str
+        Name of the structure contining the deubg info.
+
+    """
+    filter_length = len(coefs)
+
+    max_val = np.max(np.abs(coefs))
+    _, e = np.frexp(max_val)
+    exp = 31 - e
+
+    quantised_filter = np.array(np.rint(np.ldexp(coefs, exp)), dtype=np.int32)
+    quantised_filter = np.clip(quantised_filter, np.iinfo(np.int32).min, np.iinfo(np.int32).max)
+    v = np.where(quantised_filter > 0, np.iinfo(np.int32).max, np.iinfo(np.int32).min)
+
+    # Convert to pythons arb precision ints
+    max_accu = sum([a * b for a, b in zip(quantised_filter.tolist(), v.tolist())])
+
+    prod_shr = int(np.ceil(np.log2(max_accu / np.iinfo(np.int64).max)))
+    if prod_shr < 0:
+        prod_shr = 0
+
+    accu_shr = exp - prod_shr
+    coef_data_name = "debug_" + name + "_filter_taps"
+    fh.write(
+        "int32_t __attribute__((aligned (8))) "
+        + coef_data_name
+        + "["
+        + str(filter_length)
+        + "] = {\n"
+    )
+
+    counter = 1
+    for val in coefs:
+        int_val = np.int32(np.rint(np.ldexp(val, exp)))
+        fh.write("%12d" % (int_val))
+        if counter != filter_length:
+            fh.write(",\t")
+        if counter % 4 == 0:
+            fh.write("\n")
+        counter += 1
+    fh.write("};\n\n")
+
+    struct_name = "td_block_debug_fir_filter_" + name
+
+    fh.write('#include "ref_fir.h"\n')
+    fh.write("td_reference_fir_filter_t " + struct_name + " = {\n")
+    fh.write("\t.coefs = " + coef_data_name + ",\n")
+    fh.write("\t.length = " + str(filter_length) + ",\n")
+    fh.write("\t.exponent = " + str(-exp) + ",\n")
+    fh.write("\t.accu_shr = " + str(accu_shr) + ",\n")
+    fh.write("\t.prod_shr = " + str(prod_shr) + ",\n")
+    fh.write("};\n")
+    fh.write("\n")
+
+    return struct_name
+
+
+def generate_debug_fir(
+    td_coefs: np.ndarray,
+    filter_name: str,
+    output_path: str,
+    frame_advance=None,
+    frame_overlap=None,
+    td_block_length=None,
+    gain_db=0.0,
+    verbose=False,
+):
+    """Convert the input array into a header to be included in a C debug tests."""
+    output_file_name = os.path.join(output_path, filter_name + "_debug.h")
+    td_coefs = np.array(td_coefs, dtype=np.float64)
+
+    with open(output_file_name, "w") as fh:
+        fh.write('#include "dsp/fd_block_fir.h"\n\n')
+
+        emit_debug_filter(fh, td_coefs, filter_name)
+
+        fh.write(
+            "#define debug_" + filter_name + "_DATA_BUFFER_ELEMENTS (" + str(len(td_coefs)) + ")\n"
+        )
+        fh.write("\n")
--- a/lib_audio_dsp/test/fd_block_fir/src/main.c
+++ b/lib_audio_dsp/test/fd_block_fir/src/main.c
@@ -0,0 +1,82 @@
+// Copyright 2024-2025 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <xcore/hwtimer.h>
+#include "../autogen/dut.h"
+#include "../autogen/dut_debug.h"
+
+#include "ref_fir.h"
+
+/*
+This tests for equivalence between the FD implementation and the TD reference.
+It has an allowed error of 32 for mean abs error and abs mean error. 
+*/
+int run_test(void){
+    // Feeds identical random frames to the FD filter under test and to the TD reference; returns 0 on pass, 1 on fail.
+    int32_t __attribute__((aligned (8))) data[dut_DATA_BUFFER_ELEMENTS];
+    int32_t __attribute__((aligned (8))) new_data[dut_TD_BLOCK_LENGTH];
+    int32_t __attribute__((aligned (8))) data_td[debug_dut_DATA_BUFFER_ELEMENTS];
+    
+    memset(new_data, 0, sizeof(new_data));
+    memset(data_td, 0, sizeof(data_td));
+    memset(data, 0, sizeof(data));
+    fd_fir_data_t fd_fir_data_dut;
+    fd_block_fir_data_init(&fd_fir_data_dut, data, 
+        dut_FRAME_ADVANCE, 
+        dut_TD_BLOCK_LENGTH, 
+        dut_BLOCK_COUNT);
+    // 64-bit accumulators: large errors from a broken filter must not wrap around and slip under the threshold.
+    int64_t error_sum = 0;
+    int64_t abs_error_sum = 0;
+    int count = 0;
+    // Reference outputs carried over from the previous frame (its last dut_FRAME_OVERLAP samples).
+    int32_t frame_overlap[dut_FRAME_OVERLAP];
+    memset(frame_overlap, 0, sizeof(frame_overlap));
+    for(int j=0;j<dut_BLOCK_COUNT + 2;j++)
+    {
+        for(int i=0;i<dut_FRAME_ADVANCE;i++)
+            new_data[i] = rand()-rand();
+
+        int32_t td_processed[dut_FRAME_ADVANCE + dut_FRAME_OVERLAP];
+        // Reference frame = saved overlap followed by dut_FRAME_ADVANCE freshly filtered samples.
+        memcpy(td_processed, frame_overlap, sizeof(frame_overlap));
+        for(int i=0;i<dut_FRAME_ADVANCE;i++)
+            td_processed[i+dut_FRAME_OVERLAP] = td_reference_fir(new_data[i], &td_block_debug_fir_filter_dut, data_td);
+        memcpy(frame_overlap, td_processed + dut_FRAME_ADVANCE, sizeof(frame_overlap));
+
+        int32_t __attribute__((aligned (8))) fd_processed[dut_TD_BLOCK_LENGTH] = {0};
+        fd_block_fir_add_data(new_data, &fd_fir_data_dut);
+        fd_block_fir_compute(
+            fd_processed,
+            &fd_fir_data_dut,
+            &fd_fir_filter_dut);
+        // NOTE(review): reads fd_processed up to FRAME_ADVANCE + FRAME_OVERLAP; assumes that is <= dut_TD_BLOCK_LENGTH.
+        for(int i=0;i<dut_FRAME_ADVANCE + dut_FRAME_OVERLAP;i++){
+            int64_t error = (int64_t)td_processed[i] - fd_processed[i];  // widened first so the difference cannot overflow
+            // printf("%2d td:%12ld fd:%12ld error:%lld\n", i, (long)td_processed[i], (long)fd_processed[i], (long long)error);
+            error_sum += error;
+            if(error < 0) error = -error;
+            abs_error_sum += error;
+            count++;
+        }
+
+    }
+    float error_ave_abs =  (float)error_sum / count;
+    if(error_ave_abs<0)error_ave_abs=-error_ave_abs;
+    if (error_ave_abs > 32.0){
+        printf("avg error:%f avg abs error:%f dut_TD_BLOCK_LENGTH:%d dut_BLOCK_COUNT:%d DATA_BUFFER_ELEMENTS:%d\n", (float)error_sum / count, (float)abs_error_sum / count, dut_TD_BLOCK_LENGTH, dut_BLOCK_COUNT, debug_dut_DATA_BUFFER_ELEMENTS);
+        return 1;
+    }
+    if(((float)abs_error_sum / count) > 32.0){
+        printf("avg error:%f avg abs error:%f dut_TD_BLOCK_LENGTH:%d dut_BLOCK_COUNT:%d DATA_BUFFER_ELEMENTS:%d\n", (float)error_sum / count, (float)abs_error_sum / count, dut_TD_BLOCK_LENGTH, dut_BLOCK_COUNT, debug_dut_DATA_BUFFER_ELEMENTS);
+        return 1;
+    }
+    return 0;
+}
+// The program's return value is the verdict of run_test(): 0 on pass, 1 on fail.
+int main() {
+  return run_test();
+}
--- a/lib_audio_dsp/test/fd_block_fir/src/ref_fir.c
+++ b/lib_audio_dsp/test/fd_block_fir/src/ref_fir.c
@@ -0,0 +1,120 @@
+// Copyright 2024-2025 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+// Code for reference: accurate but slow
+// prod_shr prevents accumulator overflow
+// accu_shr returns the accumulator to the correct output q value
+#include "ref_fir.h"
+#include <string.h>
+#include "xmath/xs3/vpu_scalar_ops.h"
+
+
+int32_t td_reference_fir(
+    int32_t new_sample,
+    td_reference_fir_filter_t *filter,
+    int32_t *data)
+{
+    // Rounding offsets are formed in 64 bits and are zero for a zero shift, where "1 << (shr - 1)" is undefined.
+    const int64_t prod_round = filter->prod_shr ? ((int64_t)1 << (filter->prod_shr - 1)) : 0;
+    const int64_t accu_round = filter->accu_shr ? ((int64_t)1 << (filter->accu_shr - 1)) : 0;
+    for (uint32_t i = filter->length - 1; i > 0; i--)  // age the history so data[0] can take the newest sample
+        data[i] = data[i - 1];
+    data[0] = new_sample;
+    int64_t accu = 0;
+    for (uint32_t i = 0; i < filter->length; i++)
+    {
+        int64_t p = (int64_t)data[i] * (int64_t)filter->coefs[i];
+        accu += (p + prod_round) >> filter->prod_shr;  // scale each product down before accumulating
+    }
+    int64_t res = (accu + accu_round) >> filter->accu_shr;
+    if (res > INT32_MAX)
+        res = INT32_MAX;
+    if (res < INT32_MIN)
+        res = INT32_MIN;
+    return res;  // saturated to the int32 range above
+}
+
+
+void td_block_fir_add_data_ref(
+    int32_t samples_in[TD_BLOCK_FIR_LENGTH],
+    td_block_fir_data_t *fir_data)
+{
+    // Copy one block of samples into fir_data->data at the current index, then move the index on by 32.
+    int head;
+
+    // Unconditional write at the current position; the void * cast makes index a byte offset (GNU pointer arithmetic).
+    memcpy((void *)fir_data->data + fir_data->index, samples_in, sizeof(int32_t) * TD_BLOCK_FIR_LENGTH);
+    // if this is the end of the buffer then paste it onto the front too
+    if (fir_data->index == fir_data->data_stride)
+    {
+        memcpy(fir_data->data + 0, samples_in, sizeof(int32_t) * TD_BLOCK_FIR_LENGTH);
+        head = 32;  // NOTE(review): 32 is presumably sizeof(int32_t) * TD_BLOCK_FIR_LENGTH - confirm
+    }
+    else
+    {
+        head = fir_data->index + 32;
+    }
+
+    fir_data->index = head;
+}
+
+
+void td_block_fir_compute_ref(
+    int32_t output_block[TD_BLOCK_FIR_LENGTH],
+    td_block_fir_data_t *fir_data,
+    td_block_fir_filter_t *fir_filter)
+{
+    // Reference block FIR: one 64-bit accumulator per output sample, updated by vlmaccr32 for every coefficient block.
+    int64_t accu[TD_BLOCK_FIR_LENGTH];
+    memset(accu, 0, sizeof(accu));
+    // Byte-wise start address (void * arithmetic): block_count blocks of 32 bytes before index + data_stride.
+    void *data_p = (void *)fir_data->data + fir_data->index + fir_data->data_stride - fir_filter->block_count * 32;
+    // Split the blocks between the loop run before data_p is moved back by data_stride and the loop run after.
+    int second_loop_coutner = (fir_data->index - 32) / 32;
+    int first_loop_coutner = fir_filter->block_count - second_loop_coutner;
+
+    if (first_loop_coutner <= 0)
+    {
+        second_loop_coutner += first_loop_coutner;
+        first_loop_coutner = 0;
+    }
+
+    void *filter_p = fir_filter->coefs;
+    while (first_loop_coutner != 0)
+    {
+        for (int b = 0; b < TD_BLOCK_FIR_LENGTH; b++)
+        {
+            accu[TD_BLOCK_FIR_LENGTH - 1 - b] = vlmaccr32(accu[TD_BLOCK_FIR_LENGTH - 1 - b], data_p, filter_p);
+            data_p -= 4;  // step back 4 bytes for the next output sample
+        }
+        data_p += 64;
+        filter_p += 32;
+        first_loop_coutner--;
+    }
+    data_p -= fir_data->data_stride;
+    while (second_loop_coutner != 0)
+    {
+        for (int b = 0; b < TD_BLOCK_FIR_LENGTH; b++)
+        {
+            accu[TD_BLOCK_FIR_LENGTH - 1 - b] = vlmaccr32(accu[TD_BLOCK_FIR_LENGTH - 1 - b], data_p, filter_p);
+            data_p -= 4;
+        }
+        data_p += 64;
+        filter_p += 32;
+        second_loop_coutner--;
+    }
+    // Round, rescale and saturate each accumulator into the int32 output block.
+    uint32_t accu_shr = fir_filter->accu_shr;
+    uint32_t accu_shl = fir_filter->accu_shl;
+    const int64_t accu_round = accu_shr ? ((int64_t)1 << (accu_shr - 1)) : 0;  // "1 << (0 - 1)" is undefined
+    for (int i = 0; i < TD_BLOCK_FIR_LENGTH; i++)
+    {
+        int64_t t = (accu[i] + accu_round) >> accu_shr;
+        int64_t res = t << accu_shl;
+        if (res > INT32_MAX)
+            res = INT32_MAX;
+        if (res < INT32_MIN)
+            res = INT32_MIN;
+        output_block[i] = res;
+    }
+}
--- a/lib_audio_dsp/test/fd_block_fir/src/ref_fir.h
+++ b/lib_audio_dsp/test/fd_block_fir/src/ref_fir.h
@@ -0,0 +1,60 @@
+// Copyright 2024-2025 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+
+#pragma once
+
+#include "xmath/filter.h"
+#include "dsp/td_block_fir.h"
+/**
+ * @brief Time domain filter struct for reference.
+ */
+typedef struct td_reference_fir_filter_t{
+  /** Pointer to the actual coefficients. */
+  int32_t * coefs;
+  /** The count of coefficients. */
+  uint32_t length;
+  /** The output exponent (for printing). Signed, because the generator writes the negated coefficient shift. */
+  int32_t exponent;
+  /** The amount to shr the accumulator after all accumulation is complete. */
+  uint32_t accu_shr;
+  /** The amount to shr the product of data and coef before accumulating. */
+  uint32_t prod_shr;
+} td_reference_fir_filter_t;
+
+/**
+ * @brief This implements a FIR at the highest possible precision in a human readable way. Its use
+ * is for debug and regression.
+ * 
+ * @param new_sample A single sample to add to the time series data.
+ * @param filter Pointer to the td_reference_fir_filter_t struct.
+ * @param data Pointer to the actual time series data.
+ * @return int32_t The output of the filtered data.
+ */
+int32_t td_reference_fir(
+    int32_t new_sample,
+    td_reference_fir_filter_t * filter,
+    int32_t * data);
+
+/**
+ * @brief Function to add samples to the FIR data structure. This is for debug and test only.
+ * 
+ * @param input_block Array of int32_t samples of length TD_BLOCK_FIR_LENGTH.
+ * @param fir_data Pointer to struct of type td_block_fir_data_t to which the samples will be added.
+ */
+void td_block_fir_add_data_ref(
+    int32_t input_block[TD_BLOCK_FIR_LENGTH],
+    td_block_fir_data_t * fir_data);
+    
+/**
+ * @brief Function to compute the convolution between fir_data and fir_filter. This is for debug and test only.
+ * 
+ * @param samples_out Array of length TD_BLOCK_FIR_LENGTH(8), which will be used to return the 
+        processed samples.
+ * @param fir_data Pointer to struct of type td_block_fir_data_t from which the data samples will be obtained.
+ * @param fir_filter Pointer to struct of type td_block_fir_filter_t from which the coefficients will be obtained.
+ */
+void td_block_fir_compute_ref(
+    int32_t samples_out[TD_BLOCK_FIR_LENGTH],
+    td_block_fir_data_t * fir_data, 
+    td_block_fir_filter_t * fir_filter
+); 
--- a/lib_audio_dsp/test/fd_block_fir/test_fd_block_fir.py
+++ b/lib_audio_dsp/test/fd_block_fir/test_fd_block_fir.py
@@ -0,0 +1,151 @@
+# Copyright 2024-2025 XMOS LIMITED.
+# This Software is subject to the terms of the XMOS Public Licence: Version 1.
+import numpy as np
+from pathlib import Path
+import subprocess
+import os
+import sys
+import shutil
+import pytest
+from scipy.signal import firwin
+from audio_dsp.dsp.fd_block_fir import generate_fd_fir
+from ref_fir import generate_debug_fir
+import uuid
+
+# TODO move build utils somewhere else
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../pipeline/python')))
+from build_utils import build
+from filelock import FileLock
+
+build_dir_name = "build"
+
+
+def build_and_run_tests(dir_name, coefficients, frame_advance = 16, td_block_length = None, frame_overlap = 0, sim = True, gain_db = 0.0):
+
+    local_build_dir_name = build_dir_name
+
+    bin_dir = Path(__file__).parent / "bin"
+    gen_dir = Path(__file__).parent / "autogen"
+    build_dir = Path(__file__).parent / local_build_dir_name
+
+    bin_dir.mkdir(exist_ok=True, parents=True)
+
+    if frame_advance is None:
+        frame_advance = max(td_block_length//2, 1)
+
+    # the builds share files, so can't be built in parallel, but we can run xsim in parallel after
+    with FileLock("build_blocker.lock"):
+        gen_dir.mkdir(exist_ok=True, parents=True)
+
+        # run the filter_generator on the coefs
+        try:
+            generate_fd_fir(coefficients, "dut", gen_dir, frame_advance, frame_overlap, td_block_length, 
+                        gain_db = gain_db, verbose = True)
+            generate_debug_fir(coefficients, "dut", gen_dir, frame_advance, frame_overlap, td_block_length, 
+                        gain_db = gain_db, verbose = True)
+        except ValueError as e:
+            if "Bad config" not in str(e):
+                raise e
+            else:
+                print("caught bad config")
+                print(str(e))
+                print('coef count', len(coefficients), 'frame_advance', frame_advance, 'td_block_length', td_block_length, 'frame_overlap', frame_overlap)
+                raise e
+                return
+        except Exception as e:
+            print('FAIL coef count', len(coefficients), 'frame_advance', frame_advance, 'td_block_length', td_block_length, 'frame_overlap', frame_overlap)
+            raise e
+
+        # build the project
+        build(Path(dir_name), Path(build_dir), "fd_fir_test")
+
+        unique_xe = str(bin_dir / f"{uuid.uuid4().hex[:10]}_fd_fir_test.xe")
+        os.rename(str(bin_dir / "fd_fir_test.xe"), unique_xe)
+
+        # Clean up
+        shutil.rmtree(gen_dir) 
+
+    app = "xsim" if sim else "xrun --io"
+    run_cmd = app + " --args " + str(bin_dir / unique_xe) 
+    
+    proc = subprocess.run(run_cmd,  cwd = dir_name, shell = True)
+
+    sig_int = proc.returncode
+
+    if sig_int == 0:
+        pass
+    else:
+        print('FAIL coef count', len(coefficients), 'frame_advance', frame_advance, 'td_block_length', td_block_length, 'frame_overlap', frame_overlap)
+        raise RuntimeError(f"xsim failed: {sig_int}")
+
+    return sig_int
+
+dir_name = Path(__file__).parent
+
+def test_trivial():
+    build_and_run_tests(dir_name, np.random.uniform(-0.125, 0.125, 34))
+
+@pytest.mark.parametrize("td_block_length", [16])
+@pytest.mark.parametrize(["filter_length_mul", "filter_length_mod"], [[1, -2],
+                                                                      [2, 1],
+                                                                      [3, 3]])
+@pytest.mark.parametrize("frame_overlap,", [0, 3])
+@pytest.mark.parametrize("frame_advance_mod", [-2, 0, 1])
+def test_constant_value_variable_length(td_block_length, filter_length_mul, filter_length_mod, frame_overlap, frame_advance_mod):
+    filter_length = (td_block_length*filter_length_mul)//2 + filter_length_mod
+    frame_advance = td_block_length//2 + frame_advance_mod
+    build_and_run_tests(dir_name, 
+                        np.ones(filter_length)/filter_length, 
+                        td_block_length = None, 
+                        frame_overlap = frame_overlap,
+                        frame_advance = frame_advance)
+
+@pytest.mark.parametrize("length", range(15, 19, 2))
+def test_random_value_variable_length(length):
+    build_and_run_tests(dir_name, 0.125*np.random.uniform(-1, 1, length))
+
+@pytest.mark.parametrize("length", range(1, 18, 2))
+def test_extreme_value_variable_length(length):
+    c = np.random.randint(0, 2, length)*2 - 1
+    build_and_run_tests(dir_name, c)
+
+@pytest.mark.parametrize("length", range(2, 17, 2))
+def test_all_negative_variable_length(length):
+    c = -np.ones(length)
+    build_and_run_tests(dir_name, c)
+
+@pytest.mark.parametrize("length", range(2, 17, 2))
+def test_random_pos_value_variable_length(length):
+    build_and_run_tests(dir_name, np.abs(np.random.uniform(-1, 1, length)))
+
+@pytest.mark.parametrize("length", range(2, 17, 2))
+def test_random_neg_value_variable_length(length):
+    build_and_run_tests(dir_name, np.abs(np.random.uniform(-1, 1, length)))
+
+@pytest.mark.skip("Slow test")
+@pytest.mark.parametrize("length", [1024, 4096])
+def test_long_lengths(length):
+    build_and_run_tests(dir_name, np.random.uniform(-1, 1, length))
+
+@pytest.mark.parametrize("length", [16, 17, 18, 32, 33, 34, 127, 128, 129])
+def test_real_filter(length):
+    build_and_run_tests(dir_name, firwin(length, 0.5))
+
+@pytest.mark.parametrize("length", range(2, 17, 2))
+def test_main(length):
+    coeffs = np.abs(np.random.uniform(-1, 1, length))
+    coeff_name = f"tmp_coeffs_{length}.npy"
+    np.save(coeff_name, coeffs)
+    frame_advance = 4
+
+    out_folder = f"autogen_{os.environ.get('PYTEST_XDIST_WORKER')}"
+    os.makedirs(out_folder, exist_ok=True)
+
+    subprocess.check_output(f"python -m audio_dsp.dsp.fd_block_fir {coeff_name} {frame_advance} --output {out_folder}", shell=True)
+
+    shutil.rmtree(out_folder)
+# Direct invocation runs only test_trivial; the commented-out calls are alternatives for manual debugging.
+if __name__ == "__main__":
+    # test_constant_value_variable_length(16, 2, -2, 2, 0)
+    # test_long_lengths(1024)
+    test_trivial()