Math: FIR: Add more efficient generic C function fir_32x16_2x()

This patch adds a FIR function that computes two samples per call, which reduces the coefficient and PCM data accesses. The saving compared to the original FIR is 34 MCPS, from 134 MCPS to 100 MCPS. The changes include constraining the FIR coefficients length to be a multiple of four and adding two to the delay line length to have room for two input samples while keeping the length even. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
2022-02-16 16:41:30 +02:00 · 2022-02-16 16:41:30 +02:00 · 5eb7304f61
parent fc30b4b870
commit 5eb7304f61
2 changed files with 81 additions and 8 deletions
--- a/src/include/sof/math/fir_generic.h
+++ b/src/include/sof/math/fir_generic.h
@ -25,7 +25,7 @@ struct sof_eq_fir_coef_data;
 struct fir_state_32x16 {
 	int rwi; /* Circular read and write index */
 	int taps; /* Number of FIR taps */
-	int length; /* Number of FIR taps */
+	int length; /* Number of FIR taps plus input length (even) */
 	int out_shift; /* Amount of right shifts at output */
 	int16_t *coef; /* Pointer to FIR coefficients */
 	int32_t *delay; /* Pointer to FIR delay line */
@ -42,5 +42,7 @@ void fir_init_delay(struct fir_state_32x16 *fir, int32_t **data);

 int32_t fir_32x16(struct fir_state_32x16 *fir, int32_t x);

+void fir_32x16_2x(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, int32_t *y0, int32_t *y1);
+
 #endif
 #endif /* __SOF_MATH_FIR_GENERIC_H__ */
--- a/src/math/fir_generic.c
+++ b/src/math/fir_generic.c
@ -36,21 +36,26 @@ void fir_reset(struct fir_state_32x16 *fir)

 int fir_delay_size(struct sof_fir_coef_data *config)
 {
-	/* Check for sane FIR length. The generic version does not
-	 * have other constraints.
-	 */
-	if (config->length > SOF_FIR_MAX_LENGTH || config->length < 1)
+	/* Reject tap counts outside 4..SOF_FIR_MAX_LENGTH with -EINVAL */
+	if (config->length > SOF_FIR_MAX_LENGTH || config->length < 4)
 		return -EINVAL;

-	return config->length * sizeof(int32_t);
+	/* The optimization requires the tap count to be multiple of four */
+	if (config->length & 0x3)
+		return -EINVAL;
+
+	/* Size in bytes. The dual sample version needs one more delay entry,
+	 * two are added to preserve 64-bit alignment as in fir_init_coef().
+	 */
+	return (config->length + 2) * sizeof(int32_t);
 }

 int fir_init_coef(struct fir_state_32x16 *fir,
 		  struct sof_fir_coef_data *config)
 {
 	fir->rwi = 0;
-	fir->length = (int)config->length;
-	fir->taps = fir->length; /* The same for generic C version */
+	fir->taps = (int)config->length;
+	fir->length = (int)fir->taps + 2;
 	fir->out_shift = (int)config->out_shift;
 	fir->coef = ASSUME_ALIGNED(&config->coef[0], 4);
 	return 0;
@ -109,4 +114,70 @@ int32_t fir_32x16(struct fir_state_32x16 *fir, int32_t x)
 	return sat_int32(y >> shift);
 }

+/* Filter two consecutive input samples x0 and x1 with one call. The inputs
+ * are written to the circular delay line at rwi and rwi + 1, the filtered
+ * outputs are stored to *y0 and *y1. In bypass the inputs are copied to
+ * the outputs and the filter state is not touched.
+ */
+void fir_32x16_2x(struct fir_state_32x16 *fir, int32_t x0, int32_t x1, int32_t *y0, int32_t *y1)
+{
+	int64_t a0 = 0;
+	int64_t a1 = 0;
+	int32_t sample0;
+	int32_t sample1 = x1;
+	int32_t *data;
+	int16_t *coef = fir->coef;
+	int n1;
+	int n2;
+	int i;
+	const int length = fir->length;
+	const int taps = fir->taps;
+	const int shift = 15 + fir->out_shift;
+
+	/* Bypass is set with length set to zero, also skip with no taps. */
+	if (!length || !taps) {
+		*y0 = x0;
+		*y1 = x1;
+		return;
+	}
+
+	/* Write samples to delay */
+	data = &fir->delay[fir->rwi];
+	data[0] = x0;
+	data[1] = x1;
+
+	/* Advance write index and calculate into n1 the max. number of taps
+	 * to process before circular wrap.
+	 */
+	n1 = MIN(fir->rwi + 1, taps);
+	n2 = taps - n1;
+	fir->rwi += 2;
+	if (fir->rwi >= length)
+		fir->rwi -= length;
+
+	/* Part 1, loop n1 times from data[0] towards delay[0]. Indexing back
+	 * from data avoids stepping a pointer below the start of the delay.
+	 */
+	for (i = 0; i < n1; i++) {
+		sample0 = data[-i];
+		a1 += (int64_t)coef[i] * sample1;
+		a0 += (int64_t)coef[i] * sample0;
+		sample1 = sample0;
+	}
+
+	/* Part 2, un-wrap data, continue n2 times from the end of delay */
+	data = &fir->delay[length - 1];
+	coef += n1;
+	for (i = 0; i < n2; i++) {
+		sample0 = data[-i];
+		a1 += (int64_t)coef[i] * sample1;
+		a0 += (int64_t)coef[i] * sample0;
+		sample1 = sample0;
+	}
+
+	/* Q2.46 -> Q2.31, saturate to Q1.31 */
+	*y0 = sat_int32(a0 >> shift);
+	*y1 = sat_int32(a1 >> shift);
+}
+
 #endif