This commit is contained in:
Steven Dan
2025-12-11 09:43:42 +08:00
commit d8b2974133
1822 changed files with 280037 additions and 0 deletions

View File

@@ -0,0 +1,37 @@
# Build description for the fd_fir_test application, using the XMOS xcommon CMake flow.
cmake_minimum_required(VERSION 3.21)
# xcommon.cmake is located through the XMOS_CMAKE_PATH environment variable,
# which is read when CMake configures the project.
include($ENV{XMOS_CMAKE_PATH}/xcommon.cmake)
project(fd_fir_test)
# Hardware target name for this application.
set(APP_HW_TARGET XK-EVK-XU316)
# Modules the application depends on. The "(x.y.z)" suffix is presumably a
# version requirement interpreted by xcommon - confirm against the xcommon docs.
set(APP_DEPENDENT_MODULES
"lib_audio_dsp"
"lib_logging(3.2.0)"
"lib_locks(2.2.0)"
)
set(APP_PCA_ENABLE OFF)
# Compiler flags. EXTRA_BUILD_FLAGS is not set in this file, so it is empty
# unless it is supplied from outside (e.g. on the command line).
set(EXAMPLE_BUILD_FLAGS ${EXTRA_BUILD_FLAGS} -fcomment-asm
-Wall
-O3
-report
-lquadflash
-mcmodel=large
-g
-fxscope)
set(APP_COMPILER_FLAGS ${EXAMPLE_BUILD_FLAGS})
# Collect every C file directly under src/, as paths relative to this directory.
# CONFIGURE_DEPENDS makes the build re-check the glob so new files are picked up.
file(GLOB C_SRC CONFIGURE_DEPENDS RELATIVE ${CMAKE_CURRENT_LIST_DIR} src/*.c)
# NOTE(review): DSP_DIR is not referenced again in this file - confirm whether
# xcommon reads it or whether it is left over from another application.
set(DSP_DIR build/dsp_pipeline)
# NOTE(review): DSP_MAIN is not defined in this file; unless it is provided
# externally the list below ends with an empty element - confirm this is intended.
set(APP_C_SRCS
"${C_SRC};${DSP_MAIN}")
# Include directories for the application sources.
# NOTE(review): src/core, src/extensions and build/dsp_pipeline are not created
# by anything visible here - confirm they exist for this test application.
set(APP_INCLUDES
src
src/core
src/extensions
${CMAKE_CURRENT_LIST_DIR}/build/dsp_pipeline)
# Sandbox root: three directory levels above this file.
set(XMOS_SANDBOX_DIR ${CMAKE_CURRENT_LIST_DIR}/../../..)
# Presumably consumes the APP_* variables set above to create the build targets
# (defined by xcommon.cmake, not visible here).
XMOS_REGISTER_APP()

View File

@@ -0,0 +1,5 @@
app_td_block_fir
---
This demonstrates 16 concurrent FIR filters, each of length 4008, running on a single tile with a frame size of 8.
Currently, it runs at 46 kHz.

View File

@@ -0,0 +1,104 @@
# Copyright 2024-2025 XMOS LIMITED.
# This Software is subject to the terms of the XMOS Public Licence: Version 1.
"""Common code for time and frequency domain block FIR generator."""
import numpy as np
import io
import os
# emit the debug filter coefs
def emit_debug_filter(fh: io.TextIOWrapper, coefs: np.ndarray, name: str):
    """
    Emit a debug section describing the filter to the header.

    The coefficients are quantised to int32 with a shared block exponent and
    written as a C array, followed by a ``td_reference_fir_filter_t``
    initialiser that references the array.

    Parameters
    ----------
    fh : io.TextIOWrapper
        File handle of the header to write to.
    coefs : np.ndarray
        Array of floats describing the filter.
    name : str
        Name of the filter.

    Returns
    -------
    str
        Name of the structure containing the debug info.

    Raises
    ------
    ValueError
        If ``coefs`` is empty.
    """
    coefs = np.asarray(coefs, dtype=np.float64)
    filter_length = len(coefs)
    if filter_length == 0:
        raise ValueError("emit_debug_filter: coefs must contain at least one tap")

    int32_info = np.iinfo(np.int32)

    # Block exponent: scale so the largest magnitude lands just below 2**31.
    max_val = np.max(np.abs(coefs))
    _, e = np.frexp(max_val)
    exp = 31 - int(e)

    # Clip while still in floating point: a mantissa very close to 1.0 rounds
    # up to exactly 2**31, which would wrap if it were cast to int32 first.
    scaled = np.rint(np.ldexp(coefs, exp))
    quantised_filter = np.clip(scaled, int32_info.min, int32_info.max).astype(np.int32)

    # Worst-case accumulator magnitude: every tap multiplied by the full-scale
    # sample of matching sign. Python ints are used so the sum cannot overflow.
    taps = quantised_filter.tolist()
    max_accu = sum(
        tap * (int32_info.max if tap > 0 else int32_info.min) for tap in taps
    )

    # Pre-shift needed on each product so the accumulated sum fits in int64.
    # An all-zero filter has max_accu == 0 and needs no shift (log2(0) is -inf).
    if max_accu > 0:
        prod_shr = int(np.ceil(np.log2(max_accu / np.iinfo(np.int64).max)))
        if prod_shr < 0:
            prod_shr = 0
    else:
        prod_shr = 0
    accu_shr = exp - prod_shr

    coef_data_name = "debug_" + name + "_filter_taps"
    fh.write(
        "int32_t __attribute__((aligned (8))) "
        + coef_data_name
        + "["
        + str(filter_length)
        + "] = {\n"
    )
    # Four taps per line; the emitted values are the clipped quantised taps so
    # the header always matches the values used for the shift computation.
    for counter, tap in enumerate(taps, start=1):
        fh.write("%12d" % (tap))
        if counter != filter_length:
            fh.write(",\t")
        if counter % 4 == 0:
            fh.write("\n")
    fh.write("};\n\n")

    struct_name = "td_block_debug_fir_filter_" + name
    fh.write('#include "ref_fir.h"\n')
    fh.write("td_reference_fir_filter_t " + struct_name + " = {\n")
    fh.write("\t.coefs = " + coef_data_name + ",\n")
    fh.write("\t.length = " + str(filter_length) + ",\n")
    fh.write("\t.exponent = " + str(-exp) + ",\n")
    fh.write("\t.accu_shr = " + str(accu_shr) + ",\n")
    fh.write("\t.prod_shr = " + str(prod_shr) + ",\n")
    fh.write("};\n")
    fh.write("\n")
    return struct_name
def generate_debug_fir(
    td_coefs: np.ndarray,
    filter_name: str,
    output_path: str,
    frame_advance=None,
    frame_overlap=None,
    td_block_length=None,
    gain_db=0.0,
    verbose=False,
):
    """
    Write ``<filter_name>_debug.h`` into ``output_path`` for the C debug tests.

    The header contains the reference filter emitted by ``emit_debug_filter``
    and a ``debug_<filter_name>_DATA_BUFFER_ELEMENTS`` define equal to the
    number of coefficients. ``frame_advance``, ``frame_overlap``,
    ``td_block_length``, ``gain_db`` and ``verbose`` are accepted but not used
    by this function.
    """
    header_path = os.path.join(output_path, filter_name + "_debug.h")
    td_coefs = np.array(td_coefs, dtype=np.float64)

    # One history element per tap is needed by the reference filter.
    buffer_define = "".join(
        [
            "#define debug_",
            filter_name,
            "_DATA_BUFFER_ELEMENTS (",
            str(len(td_coefs)),
            ")\n",
        ]
    )

    with open(header_path, "w") as header:
        header.write('#include "dsp/fd_block_fir.h"\n\n')
        emit_debug_filter(header, td_coefs, filter_name)
        header.write(buffer_define)
        header.write("\n")

View File

@@ -0,0 +1,82 @@
// Copyright 2024-2025 XMOS LIMITED.
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <xcore/hwtimer.h>
#include "../autogen/dut.h"
#include "../autogen/dut_debug.h"
#include "ref_fir.h"
/*
This tests for equivalence between the FD implementation and the TD reference.
It has an allowed error of 32 for both the absolute mean error and the mean absolute error.
Returns 0 on success and 1 when either limit is exceeded.
*/
int run_test(void){
    int32_t __attribute__((aligned (8))) data[dut_DATA_BUFFER_ELEMENTS];
    int32_t __attribute__((aligned (8))) new_data[dut_TD_BLOCK_LENGTH];
    int32_t __attribute__((aligned (8))) data_td[debug_dut_DATA_BUFFER_ELEMENTS];

    memset(new_data, 0, sizeof(new_data));
    memset(data_td, 0, sizeof(data_td));
    memset(data, 0, sizeof(data));

    fd_fir_data_t fd_fir_data_dut;
    fd_block_fir_data_init(&fd_fir_data_dut, data,
        dut_FRAME_ADVANCE,
        dut_TD_BLOCK_LENGTH,
        dut_BLOCK_COUNT);

    // 64 bit accumulators: a badly wrong output must not be able to overflow
    // the statistics and wrap round into a passing value.
    int64_t error_sum = 0;
    int64_t abs_error_sum = 0;
    int count = 0;

    // Reference outputs carried over from the previous frame.
    int32_t frame_overlap[dut_FRAME_OVERLAP];
    memset(frame_overlap, 0, sizeof(frame_overlap));

    for(int j=0;j<dut_BLOCK_COUNT + 2;j++)
    {
        for(int i=0;i<dut_FRAME_ADVANCE;i++)
            new_data[i] = rand()-rand();

        // Reference frame: the overlap from the previous frame followed by the
        // newly computed samples.
        int32_t td_processed[dut_FRAME_ADVANCE + dut_FRAME_OVERLAP];
        memcpy(td_processed, frame_overlap, sizeof(frame_overlap));
        for(int i=0;i<dut_FRAME_ADVANCE;i++)
            td_processed[i+dut_FRAME_OVERLAP] = td_reference_fir(new_data[i], &td_block_debug_fir_filter_dut, data_td);

        // Keep the last dut_FRAME_OVERLAP reference samples for the next frame.
        memcpy(frame_overlap, td_processed + dut_FRAME_ADVANCE, sizeof(frame_overlap));

        int32_t __attribute__((aligned (8))) fd_processed[dut_TD_BLOCK_LENGTH] = {0};
        fd_block_fir_add_data(new_data, &fd_fir_data_dut);
        fd_block_fir_compute(
            fd_processed,
            &fd_fir_data_dut,
            &fd_fir_filter_dut);

        for(int i=0;i<dut_FRAME_ADVANCE + dut_FRAME_OVERLAP;i++){
            // Widen before subtracting: the difference of two int32 values
            // does not always fit in an int.
            int64_t error = (int64_t)td_processed[i] - (int64_t)fd_processed[i];
            // printf("%2d td:%12ld fd:%12ld error:%lld\n", i, (long)td_processed[i], (long)fd_processed[i], (long long)error);
            error_sum += error;
            if(error < 0) error = -error;
            abs_error_sum += error;
            count++;
        }
    }

    float mean_error = (float)error_sum / count;
    float mean_abs_error = (float)abs_error_sum / count;
    float abs_mean_error = (mean_error < 0) ? -mean_error : mean_error;

    if (abs_mean_error > 32.0 || mean_abs_error > 32.0){
        printf("avg error:%f avg abs error:%f dut_TD_BLOCK_LENGTH:%d dut_BLOCK_COUNT:%d DATA_BUFFER_ELEMENTS:%d\n", mean_error, mean_abs_error, dut_TD_BLOCK_LENGTH, dut_BLOCK_COUNT, debug_dut_DATA_BUFFER_ELEMENTS);
        return 1;
    }
    return 0;
}
/* Program entry point: the exit status is the result of run_test(). */
int main(void) {
    const int status = run_test();
    return status;
}

View File

@@ -0,0 +1,120 @@
// Copyright 2024-2025 XMOS LIMITED.
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
// Code for reference: accurate but slow
// prod_shr prevents accumulator overflow
// accu_shr returns the accumulator to the correct output q value
#include "ref_fir.h"
#include <string.h>
#include "xmath/xs3/vpu_scalar_ops.h"
/*
 * Arithmetic right shift of a 64 bit value by `shr` bits, rounding to nearest.
 *
 * The rounding term is built in 64 bit arithmetic so that shifts of 32 or more
 * are well defined. A shift of 0 returns the value unchanged, because there is
 * no rounding bit and `shr - 1` would wrap. A shift of 64 or more returns 0.
 */
static inline int64_t td_reference_round_shr64(int64_t value, uint32_t shr)
{
    if (shr == 0)
        return value;
    if (shr >= 64)
        return 0;
    return (value + ((int64_t)1 << (shr - 1))) >> shr;
}

/*
 * Reference (accurate but slow) time domain FIR.
 *
 * Shifts `data` along by one sample, stores `new_sample` at data[0] and returns
 * the filter output saturated to the int32 range. `data` must hold
 * filter->length samples. Each product is shifted right by filter->prod_shr
 * before accumulation and the accumulator is shifted right by filter->accu_shr
 * afterwards, both with rounding. A filter of length 0 returns 0 and leaves
 * `data` untouched.
 */
int32_t td_reference_fir(
    int32_t new_sample,
    td_reference_fir_filter_t *filter,
    int32_t *data)
{
    const uint32_t length = filter->length;

    // Without this guard `length - 1` wraps and the history shift runs off the buffer.
    if (length == 0)
        return 0;

    // Age the history by one sample and insert the newest one at the front.
    memmove(data + 1, data, (size_t)(length - 1) * sizeof(int32_t));
    data[0] = new_sample;

    int64_t accu = 0;
    for (uint32_t i = 0; i < length; i++)
    {
        int64_t p = (int64_t)data[i] * (int64_t)filter->coefs[i];
        accu += td_reference_round_shr64(p, filter->prod_shr);
    }

    int64_t res = td_reference_round_shr64(accu, filter->accu_shr);
    if (res > INT32_MAX)
        res = INT32_MAX;
    if (res < INT32_MIN)
        res = INT32_MIN;
    return (int32_t)res;
}
/*
 * Copy one block of TD_BLOCK_FIR_LENGTH samples into the FIR data buffer at the
 * current byte offset (fir_data->index) and advance that offset by 32.
 * When the offset equals fir_data->data_stride the block is also copied to the
 * start of the buffer and the offset restarts at 32.
 */
void td_block_fir_add_data_ref(
    int32_t samples_in[TD_BLOCK_FIR_LENGTH],
    td_block_fir_data_t *fir_data)
{
    const size_t block_bytes = sizeof(int32_t) * TD_BLOCK_FIR_LENGTH;
    int next_index;

    // index is a byte offset, so the destination is formed with byte arithmetic
    memcpy((char *)fir_data->data + fir_data->index, samples_in, block_bytes);

    if (fir_data->index != fir_data->data_stride)
    {
        next_index = fir_data->index + 32;
    }
    else
    {
        // end of the buffer: mirror the block onto the front as well
        memcpy(fir_data->data, samples_in, block_bytes);
        next_index = 32;
    }
    fir_data->index = next_index;
}
/*
 * Reference block FIR: computes TD_BLOCK_FIR_LENGTH output samples from the
 * data in fir_data and the coefficients in fir_filter, using vlmaccr32 for the
 * multiply-accumulate. Each accumulator is then shifted right by
 * fir_filter->accu_shr (with a rounding term), shifted left by
 * fir_filter->accu_shl and saturated to the int32 range before being stored.
 */
void td_block_fir_compute_ref(
int32_t output_block[TD_BLOCK_FIR_LENGTH],
td_block_fir_data_t *fir_data,
td_block_fir_filter_t *fir_filter)
{
// One 64 bit accumulator per output sample, cleared before use.
int64_t accu[TD_BLOCK_FIR_LENGTH];
memset(accu, 0, sizeof(accu));
// Byte address of the first data block to read: block_count * 32 bytes behind
// (index + data_stride).
void *data_p = (void *)fir_data->data + fir_data->index + fir_data->data_stride - fir_filter->block_count * 32;
// The coefficient blocks are processed in two runs. The second run covers
// (index - 32) / 32 blocks; the first run covers the rest of block_count.
// Presumably the split is where the read position wraps in a circular
// buffer - confirm against td_block_fir_data_t.
int second_loop_coutner = (fir_data->index - 32) / 32;
int first_loop_coutner = fir_filter->block_count - second_loop_coutner;
// If the second run alone would cover block_count or more, skip the first run
// and trim the second run to exactly block_count blocks.
if (first_loop_coutner <= 0)
{
second_loop_coutner += first_loop_coutner;
first_loop_coutner = 0;
}
void *filter_p = fir_filter->coefs;
while (first_loop_coutner != 0)
{
// Accumulators are updated from the last output to the first while the data
// pointer steps back 4 bytes (one int32) per output.
for (int b = 0; b < TD_BLOCK_FIR_LENGTH; b++)
{
accu[TD_BLOCK_FIR_LENGTH - 1 - b] = vlmaccr32(accu[TD_BLOCK_FIR_LENGTH - 1 - b], data_p, filter_p);
data_p -= 4;
}
// Move on to the next data block and the next 32 bytes of coefficients.
data_p += 64;
filter_p += 32;
first_loop_coutner--;
}
// Step the data pointer back by data_stride bytes before the second run.
data_p -= fir_data->data_stride;
while (second_loop_coutner != 0)
{
for (int b = 0; b < TD_BLOCK_FIR_LENGTH; b++)
{
accu[TD_BLOCK_FIR_LENGTH - 1 - b] = vlmaccr32(accu[TD_BLOCK_FIR_LENGTH - 1 - b], data_p, filter_p);
data_p -= 4;
}
data_p += 64;
filter_p += 32;
second_loop_coutner--;
}
uint32_t accu_shr = fir_filter->accu_shr;
uint32_t accu_shl = fir_filter->accu_shl;
for (int i = 0; i < TD_BLOCK_FIR_LENGTH; i++)
{
// NOTE(review): the rounding term is an int shift, which is undefined when
// accu_shr is 0 or greater than 31 - confirm the range the generator produces.
int64_t t = (accu[i] + (1 << (accu_shr - 1))) >> accu_shr;
int64_t res = t << accu_shl;
// Saturate to the int32 output range.
if (res > INT32_MAX)
res = INT32_MAX;
if (res < INT32_MIN)
res = INT32_MIN;
output_block[i] = res;
}
}

View File

@@ -0,0 +1,60 @@
// Copyright 2024-2025 XMOS LIMITED.
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
#pragma once
#include "xmath/filter.h"
#include "dsp/td_block_fir.h"
/**
 * @brief Time domain filter struct for reference.
 */
typedef struct td_reference_fir_filter_t{
    /** Pointer to the actual coefficients. */
    int32_t * coefs;
    /** The count of coefficients. */
    uint32_t length;
    /**
     * The output exponent (for printing). Signed, because the header
     * generator writes this field as a negated exponent.
     */
    int32_t exponent;
    /** The amount to shr the accumulator after all accumulation is complete. */
    uint32_t accu_shr;
    /** The amount to shr the product of data and coef before accumulating. */
    uint32_t prod_shr;
} td_reference_fir_filter_t;
/**
 * @brief This implements a FIR at the highest possible precision in a human readable way. Its use
 * is for debug and regression.
 *
 * @param new_sample A single sample to add to the time series data.
 * @param filter Pointer to the td_reference_fir_filter_t struct.
 * @param data Pointer to the actual time series data. It is updated in place: the new
 *             sample is stored at index 0.
 * @return int32_t The output of the filtered data.
 */
int32_t td_reference_fir(
int32_t new_sample,
td_reference_fir_filter_t * filter,
int32_t * data);
/**
 * @brief Function to add samples to the FIR data structure. This is for debug and test only.
 *
 * @param input_block Array of int32_t samples of length TD_BLOCK_FIR_LENGTH.
 * @param fir_data Pointer to struct of type td_block_fir_data_t to which the samples will be
 *                 added.
 */
void td_block_fir_add_data_ref(
int32_t input_block[TD_BLOCK_FIR_LENGTH],
td_block_fir_data_t * fir_data);
/**
 * @brief Function to compute the convolution between fir_data and fir_filter. This is for debug
 * and test only.
 *
 * @param samples_out Array of length TD_BLOCK_FIR_LENGTH(8), which will be used to return the
 *                    processed samples.
 * @param fir_data Pointer to struct of type td_block_fir_data_t from which the data samples will
 *                 be obtained.
 * @param fir_filter Pointer to struct of type td_block_fir_filter_t from which the coefficients
 *                   will be obtained.
 */
void td_block_fir_compute_ref(
int32_t samples_out[TD_BLOCK_FIR_LENGTH],
td_block_fir_data_t * fir_data,
td_block_fir_filter_t * fir_filter
);

View File

@@ -0,0 +1,151 @@
# Copyright 2024-2025 XMOS LIMITED.
# This Software is subject to the terms of the XMOS Public Licence: Version 1.
import numpy as np
from pathlib import Path
import subprocess
import os
import sys
import shutil
import pytest
from scipy.signal import firwin
from audio_dsp.dsp.fd_block_fir import generate_fd_fir
from ref_fir import generate_debug_fir
import uuid
# TODO move build utils somewhere else
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../pipeline/python')))
from build_utils import build
from filelock import FileLock
# Name of the build directory, created next to this test file by build_and_run_tests.
build_dir_name = "build"
def build_and_run_tests(dir_name, coefficients, frame_advance = 16, td_block_length = None, frame_overlap = 0, sim = True, gain_db = 0.0):
    """
    Generate the filter headers, build the fd_fir_test application and run it.

    The generated headers and the build are shared between tests, so header
    generation, the build and the rename of the resulting binary all happen
    under a file lock. The binary is renamed to a unique name so that it can be
    run outside the lock.

    Returns the process return code (always 0; a non-zero code raises
    RuntimeError). Exceptions from the generators are re-raised after the
    configuration has been printed.
    """
    test_root = Path(__file__).parent
    bin_dir = test_root / "bin"
    gen_dir = test_root / "autogen"
    build_dir = test_root / build_dir_name
    bin_dir.mkdir(exist_ok=True, parents=True)

    if frame_advance is None:
        frame_advance = max(td_block_length//2, 1)

    # Configuration summary printed whenever something goes wrong.
    config = ('coef count', len(coefficients), 'frame_advance', frame_advance,
              'td_block_length', td_block_length, 'frame_overlap', frame_overlap)

    # the builds share files, so can't be built in parallel, but we can run xsim in parallel after
    with FileLock("build_blocker.lock"):
        gen_dir.mkdir(exist_ok=True, parents=True)

        # run the filter generators on the coefs
        try:
            generate_fd_fir(coefficients, "dut", gen_dir, frame_advance, frame_overlap, td_block_length,
                            gain_db = gain_db, verbose = True)
            generate_debug_fir(coefficients, "dut", gen_dir, frame_advance, frame_overlap, td_block_length,
                               gain_db = gain_db, verbose = True)
        except ValueError as e:
            # A rejected configuration gets extra diagnostics; every
            # ValueError is re-raised either way.
            if "Bad config" in str(e):
                print("caught bad config")
                print(str(e))
                print(*config)
            raise e
        except Exception as e:
            print('FAIL', *config)
            raise e

        # build the project
        build(Path(dir_name), Path(build_dir), "fd_fir_test")

        # give the binary a unique name so a later build cannot overwrite it
        unique_xe = str(bin_dir / f"{uuid.uuid4().hex[:10]}_fd_fir_test.xe")
        os.rename(str(bin_dir / "fd_fir_test.xe"), unique_xe)

        # Clean up
        shutil.rmtree(gen_dir)

    runner = "xsim" if sim else "xrun --io"
    run_cmd = runner + " --args " + str(bin_dir / unique_xe)
    proc = subprocess.run(run_cmd, cwd = dir_name, shell = True)

    sig_int = proc.returncode
    if sig_int != 0:
        print('FAIL', *config)
        raise RuntimeError(f"xsim failed: {sig_int}")
    return sig_int
# Directory containing this test file; passed to build_and_run_tests by every test below.
dir_name = Path(__file__).parent
def test_trivial():
    """Smoke test: 34 random coefficients in [-0.125, 0.125) with default settings."""
    coefficients = np.random.uniform(-0.125, 0.125, 34)
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.parametrize("td_block_length", [16])
@pytest.mark.parametrize(["filter_length_mul", "filter_length_mod"], [[1, -2],
                                                                        [2, 1],
                                                                        [3, 3]])
@pytest.mark.parametrize("frame_overlap,", [0, 3])
@pytest.mark.parametrize("frame_advance_mod", [-2, 0, 1])
def test_constant_value_variable_length(td_block_length, filter_length_mul, filter_length_mod, frame_overlap, frame_advance_mod):
    """
    Moving-average filters of several lengths, frame advances and overlaps.

    td_block_length is only used to derive the filter length and the frame
    advance; the builder itself is called with td_block_length=None.
    """
    filter_length = (td_block_length*filter_length_mul)//2 + filter_length_mod
    frame_advance = td_block_length//2 + frame_advance_mod

    # every tap is 1/length
    coefficients = np.ones(filter_length)/filter_length

    build_and_run_tests(dir_name,
                        coefficients,
                        frame_advance = frame_advance,
                        td_block_length = None,
                        frame_overlap = frame_overlap)
@pytest.mark.parametrize("length", range(15, 19, 2))
def test_random_value_variable_length(length):
    """Random coefficients drawn from [-1, 1) and scaled by 0.125."""
    coefficients = 0.125*np.random.uniform(-1, 1, length)
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.parametrize("length", range(1, 18, 2))
def test_extreme_value_variable_length(length):
    """Every coefficient is either -1 or +1, chosen at random."""
    signs = np.random.randint(0, 2, length)*2 - 1
    build_and_run_tests(dir_name, signs)
@pytest.mark.parametrize("length", range(2, 17, 2))
def test_all_negative_variable_length(length):
    """Every coefficient is exactly -1."""
    coefficients = -np.ones(length)
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.parametrize("length", range(2, 17, 2))
def test_random_pos_value_variable_length(length):
    """Random coefficients that are all non-negative."""
    magnitudes = np.abs(np.random.uniform(-1, 1, length))
    build_and_run_tests(dir_name, magnitudes)
@pytest.mark.parametrize("length", range(2, 17, 2))
def test_random_neg_value_variable_length(length):
    """Random coefficients that are all non-positive."""
    # Negate the magnitudes: without the minus sign this test repeats the
    # all-positive case and never exercises negative coefficients.
    coefficients = -np.abs(np.random.uniform(-1, 1, length))
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.skip("Slow test")
@pytest.mark.parametrize("length", [1024, 4096])
def test_long_lengths(length):
    """Long random filters; skipped by default because the test is slow."""
    coefficients = np.random.uniform(-1, 1, length)
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.parametrize("length", [16, 17, 18, 32, 33, 34, 127, 128, 129])
def test_real_filter(length):
    """Filters designed with scipy's firwin at a cutoff of 0.5."""
    coefficients = firwin(length, 0.5)
    build_and_run_tests(dir_name, coefficients)
@pytest.mark.parametrize("length", range(2, 17, 2))
def test_main(length):
    """
    Run the audio_dsp.dsp.fd_block_fir command line entry point on a saved
    coefficient file and check that it exits successfully.

    The temporary coefficient file and the output folder are removed even when
    the command fails.
    """
    coeffs = np.abs(np.random.uniform(-1, 1, length))
    coeff_name = f"tmp_coeffs_{length}.npy"
    frame_advance = 4
    # One output folder per pytest-xdist worker so parallel runs do not collide.
    out_folder = f"autogen_{os.environ.get('PYTEST_XDIST_WORKER')}"

    try:
        np.save(coeff_name, coeffs)
        os.makedirs(out_folder, exist_ok=True)
        # Use the interpreter running the tests, so the module is resolved in
        # the same environment rather than whichever "python" is first on PATH.
        subprocess.check_output(
            [
                sys.executable,
                "-m",
                "audio_dsp.dsp.fd_block_fir",
                coeff_name,
                str(frame_advance),
                "--output",
                out_folder,
            ]
        )
    finally:
        shutil.rmtree(out_folder, ignore_errors=True)
        if os.path.exists(coeff_name):
            os.remove(coeff_name)
# Manual entry point: running this file directly executes a single test without pytest.
# The commented-out calls are alternative tests that can be enabled by hand.
if __name__ == "__main__":
    # test_constant_value_variable_length(16, 2, -2, 2, 0)
    # test_long_lengths(1024)
    test_trivial()