thesofproject · singalsu · Mar 17, 2026 · Mar 18, 2026
@@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT)
 endif()
 
 if(CONFIG_MATH_FFT_MULTI)
-  list(APPEND base_files fft_multi.c)
+  list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c)
 endif()
 
 is_zephyr(zephyr)

@@ -19,13 +19,16 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	ae_int32x2 sample;
 	ae_int32x2 sample1;
 	ae_int32x2 sample2;
+	ae_int32x2 tw;
 	ae_int32x2 *inx = (ae_int32x2 *)plan->inb32;
 	ae_int32x2 *outx = (ae_int32x2 *)plan->outb32;
-	ae_int32x2 *outtop;
-	ae_int32x2 *outbottom;
+	ae_int32x2 *top_ptr;
+	ae_int32x2 *bot_ptr;
 	uint16_t *idx = &plan->bit_reverse_idx[0];
-	int depth, top, bottom, index;
-	int i, j, k, m, n;
+	const int32_t *tw_r;
+	const int32_t *tw_i;
+	int depth, i;
+	int j, k, m, n;
 	int size = plan->size;
 	int len = plan->len;
 
@@ -35,7 +38,6 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	if (!plan->inb32 || !plan->outb32)
 		return;
 
-	/* convert to complex conjugate for ifft */
 	/* step 1: re-arrange input in bit reverse order, and shrink the level to avoid overflow */
 	if (ifft) {
 		/* convert to complex conjugate for ifft */
@@ -54,43 +56,82 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 		}
 	}
 
-	/* step 2: loop to do FFT transform in smaller size */
-	for (depth = 1; depth <= len; ++depth) {
+	/*
+	 * Step 2a: First FFT stage (depth=1, m=2, n=1).
+	 * All butterflies use twiddle factor W^0 = 1+0j,
+	 * so the complex multiply is skipped entirely.
+	 */
+	top_ptr = outx;
+	bot_ptr = outx + 1;
+	for (k = 0; k < size; k += 2) {
+		sample1 = AE_L32X2_I(top_ptr, 0);
+		sample2 = AE_L32X2_I(bot_ptr, 0);
+		sample = AE_ADD32S(sample1, sample2);
+		AE_S32X2_I(sample, top_ptr, 0);
+		sample = AE_SUB32S(sample1, sample2);
+		AE_S32X2_I(sample, bot_ptr, 0);
+		top_ptr += 2;
+		bot_ptr += 2;
+	}
+
+	/* Step 2b: Remaining FFT stages (depth >= 2) */
+	for (depth = 2; depth <= len; ++depth) {
 		m = 1 << depth;
 		n = m >> 1;
 		i = FFT_SIZE_MAX >> depth;
 
+		top_ptr = outx;
+		bot_ptr = outx + n;
+
 		/* doing FFT transforms in size m */
 		for (k = 0; k < size; k += m) {
-			/* doing one FFT transform for size m */
-			for (j = 0; j < n; ++j) {
-				index = i * j;
-				top = k + j;
-				bottom = top + n;
+			/*
+			 * j=0: twiddle factor W^0 = 1+0j,
+			 * butterfly without complex multiply.
+			 */
+			sample1 = AE_L32X2_I(top_ptr, 0);
+			sample = AE_L32X2_I(bot_ptr, 0);
+			sample2 = AE_ADD32S(sample1, sample);
+			AE_S32X2_I(sample2, top_ptr, 0);
+			sample2 = AE_SUB32S(sample1, sample);
+			AE_S32X2_I(sample2, bot_ptr, 0);
+			top_ptr++;
+			bot_ptr++;
 
-				/* load twiddle factor to sample1 */
-				sample1 = twiddle_real_32[index];
-				sample2 = twiddle_imag_32[index];
-				sample1 = AE_SEL32_LH(sample1, sample2);
+			/* j=1..n-1: full butterfly with twiddle multiply */
+			tw_r = &twiddle_real_32[i];
+			tw_i = &twiddle_imag_32[i];
+			for (j = 1; j < n; ++j) {
+				/* load and combine twiddle factor {real, imag} */
+				sample1 = tw_r[0];
+				sample2 = tw_i[0];
+				tw = AE_SEL32_LH(sample1, sample2);
 
 				/* calculate the accumulator: twiddle * bottom */
-				sample2 = outx[bottom];
-				res = AE_MULF32S_HH(sample1, sample2);
-				AE_MULSF32S_LL(res, sample1, sample2);
-				res1 = AE_MULF32S_HL(sample1, sample2);
-				AE_MULAF32S_LH(res1, sample1, sample2);
+				sample2 = AE_L32X2_I(bot_ptr, 0);
+				res = AE_MULF32S_HH(tw, sample2);
+				AE_MULSF32S_LL(res, tw, sample2);
+				res1 = AE_MULF32S_HL(tw, sample2);
+				AE_MULAF32S_LH(res1, tw, sample2);
 				sample = AE_ROUND32X2F64SSYM(res, res1);
-				sample1 = outx[top];
+				sample1 = AE_L32X2_I(top_ptr, 0);
+
 				/* calculate the top output: top = top + accumulate */
 				sample2 = AE_ADD32S(sample1, sample);
-				outtop = outx + top;
-				AE_S32X2_I(sample2, outtop, 0);
+				AE_S32X2_I(sample2, top_ptr, 0);
 
 				/* calculate the bottom output: bottom = top - accumulate */
 				sample2 = AE_SUB32S(sample1, sample);
-				outbottom = outx + bottom;
-				AE_S32X2_I(sample2, outbottom, 0);
+				AE_S32X2_I(sample2, bot_ptr, 0);
+
+				top_ptr++;
+				bot_ptr++;
+				tw_r += i;
+				tw_i += i;
 			}
+			/* advance pointers past current group's bottom half */
+			top_ptr += n;
+			bot_ptr += n;
 		}
 	}
 

@@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL);
 SOF_DEFINE_REG_UUID(math_fft_multi);
 DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO);
 
-/* Constants for size 3 DFT */
-#define DFT3_COEFR	-1073741824	/* int32(-0.5 * 2^31) */
-#define DFT3_COEFI	1859775393	/* int32(sqrt(3) / 2 * 2^31) */
-#define DFT3_SCALE	715827883	/* int32(1/3*2^31) */
-
 struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb,
 					      void *outb, uint32_t size, int bits)
 {
@@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
 	mod_free(mod, plan->bit_reverse_idx);
 	mod_free(mod, plan);
 }
-
-void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
-{
-	const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
-	const struct icomplex32 c1 = {DFT3_COEFR,  DFT3_COEFI};
-	struct icomplex32 x[3];
-	struct icomplex32 p1, p2, sum;
-	int i;
-
-	for (i = 0; i < 3; i++) {
-		x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
-		x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
-	}
-
-	/*
-	 *      | 1   1   1 |
-	 * c =  | 1  c0  c1 | , x = [ x0 x1 x2 ]
-	 *      | 1  c1  c0 |
-	 *
-	 * y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0)
-	 * y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0)
-	 * y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0)
-	 */
-
-	/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
-	icomplex32_adds(&x[0], &x[1], &sum);
-	icomplex32_adds(&x[2], &sum, &y[0]);
-
-	/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
-	icomplex32_mul(&c0, &x[1], &p1);
-	icomplex32_mul(&c1, &x[2], &p2);
-	icomplex32_adds(&p1, &p2, &sum);
-	icomplex32_adds(&x[0], &sum, &y[1]);
-
-	/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
-	icomplex32_mul(&c1, &x[1], &p1);
-	icomplex32_mul(&c0, &x[2], &p2);
-	icomplex32_adds(&p1, &p2, &sum);
-	icomplex32_adds(&x[0], &sum, &y[2]);
-}
-
-void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
-{
-	struct icomplex32 x[FFT_MULTI_COUNT_MAX];
-	struct icomplex32 y[FFT_MULTI_COUNT_MAX];
-	struct icomplex32 t, c;
-	int i, j, k, m;
-
-	/* Handle 2^N FFT */
-	if (plan->num_ffts == 1) {
-		memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
-		fft_execute_32(plan->fft_plan[0], ifft);
-		return;
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
-	FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
-	FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
-	FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
-#endif
-
-	/* convert to complex conjugate for IFFT */
-	if (ifft) {
-		for (i = 0; i < plan->total_size; i++)
-			icomplex32_conj(&plan->inb32[i]);
-	}
-
-	/* Copy input buffers */
-	k = 0;
-	for (i = 0; i < plan->fft_size; i++)
-		for (j = 0; j < plan->num_ffts; j++)
-			plan->tmp_i32[j][i] = plan->inb32[k++];
-
-	/* Clear output buffers and call individual FFTs*/
-	for (j = 0; j < plan->num_ffts; j++) {
-		memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
-		fft_execute_32(plan->fft_plan[j], 0);
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (j = 0; j < plan->num_ffts; j++)
-		for (i = 0; i < plan->fft_size; i++)
-			fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
-#endif
-
-	/* Multiply with twiddle factors */
-	m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
-	for (j = 1; j < plan->num_ffts; j++) {
-		for (i = 0; i < plan->fft_size; i++) {
-			c = plan->tmp_o32[j][i];
-			k = j * i * m;
-			t.real = multi_twiddle_real_32[k];
-			t.imag = multi_twiddle_imag_32[k];
-			//fprintf(fh3, "%d %d\n", t.real, t.imag);
-			icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
-		}
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (j = 0; j < plan->num_ffts; j++)
-		for (i = 0; i < plan->fft_size; i++)
-			fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
-#endif
-
-	/* DFT of size 3 */
-	j = plan->fft_size;
-	k = 2 * plan->fft_size;
-	for (i = 0; i < plan->fft_size; i++) {
-		x[0] = plan->tmp_o32[0][i];
-		x[1] = plan->tmp_o32[1][i];
-		x[2] = plan->tmp_o32[2][i];
-		dft3_32(x, y);
-		plan->outb32[i] = y[0];
-		plan->outb32[i + j] = y[1];
-		plan->outb32[i + k] = y[2];
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	for (i = 0; i < plan->total_size; i++)
-		fprintf(fh4, "%d %d\n",  plan->outb32[i].real, plan->outb32[i].imag);
-#endif
-
-	/* shift back for IFFT */
-
-	/* TODO: Check if time shift method for IFFT is more efficient or more accurate
-	 * tmp = 1 / N * fft(X);
-	 * x = tmp([1 N:-1:2])
-	 */
-	if (ifft) {
-		/*
-		 * no need to divide N as it is already done in the input side
-		 * for Q1.31 format. Instead, we need to multiply N to compensate
-		 * the shrink we did in the FFT transform.
-		 */
-		for (i = 0; i < plan->total_size; i++) {
-			/* Need to negate imag part to match reference */
-			plan->outb32[i].imag = -plan->outb32[i].imag;
-			icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
-					 &plan->outb32[i]);
-			plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
-			plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
-		}
-	}
-
-#ifdef DEBUG_DUMP_TO_FILE
-	fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4);
-#endif
-}