離散傅里葉變換 - 快速計算方法及C實現 - 第三篇

DFT – Fast algorithms and C implementations - Part3


Radix-3 DFT

我在第一篇博客裏分析了Radix-2 DFT的計算原理,並據此寫出了Radix-2 DFT的代碼,並驗證了其有效性。現在我們有信心認爲,只要有了公式,就能寫出正確的程序,產生正確的結果!

下面我們推導一下Radix-3 DFT的公式,其實非常簡單:

設N長序列\lbrace x_j \rbrace_{j<N}的DFT爲\lbrace X_k \rbrace_{k<N},且N能被3整除,則:

\begin{align} X_k &=\sum_{j<N}{x_jW_N^{jk}} \nonumber \\ &=\sum_{j<N/3}{x_{3j}W_N^{3jk}}+\sum_{j<N/3}{x_{3j+1}W_N^{(3j+1)k}} +\sum_{j<N/3}{x_{3j+2}W_N^{(3j+2)k}}\nonumber \\ &= \sum{x_{3j}W_{N/3}^{jk}} + W_N^k \sum{x_{3j+1}W_{N/3}^{jk}} + W_N^{2k} \sum{x_{3j+2}W_{N/3}^{jk}} \nonumber \end{align}

\begin{align} X_{k+N/3} &=\sum_{j<N/3}{x_{3j}W_N^{3j(k+N/3)}}+\sum_{j<N/3}{x_{3j+1}W_N^{(3j+1)(k+N/3)}} +\sum_{j<N/3}{x_{3j+2}W_N^{(3j+2)(k+N/3)}}\nonumber \\ &= \sum{x_{3j}W_{N/3}^{jk}} + W_3^1W_N^k \sum{x_{3j+1}W_{N/3}^{jk}} + W_3^2 W_N^{2k} \sum{x_{3j+2}W_{N/3}^{jk}} \nonumber \end{align}

\begin{align} X_{k+2N/3} &=\sum_{j<N/3}{x_{3j}W_N^{3j(k+2N/3)}}+\sum_{j<N/3}{x_{3j+1}W_N^{(3j+1)(k+2N/3)}} +\sum_{j<N/3}{x_{3j+2}W_N^{(3j+2)(k+2N/3)}}\nonumber \\ &= \sum{x_{3j}W_{N/3}^{jk}} + W_3^2W_N^k \sum{x_{3j+1}W_{N/3}^{jk}} + W_3^4 W_N^{2k} \sum{x_{3j+2}W_{N/3}^{jk}} \nonumber \\ &= \sum{x_{3j}W_{N/3}^{jk}} + W_3^2W_N^k \sum{x_{3j+1}W_{N/3}^{jk}} + W_3^1 W_N^{2k} \sum{x_{3j+2}W_{N/3}^{jk}} \nonumber \end{align}

所以N長序列的DFT被分解成3個N/3長序列的DFT的線性組合。

Mixed Radix-2 & Radix-3 DFT

現在我們考慮一個更加複雜的問題,就是當需要交叉進行Radix-2與Radix-3變換時該怎麼做。

比如當N=12時,顯然它既不是2的指數倍,也不是3的指數倍,但是我們可以基於Radix-2和Radix-3思想對它進行分解:

上圖顯示的是先對原序列進行Radix-2分解,再對兩個子序列分別使用Radix-3分解。

上圖顯示的是先對原序列進行Radix-3分解,再對3個子序列分別使用Radix-2分解。

顯然,兩種分解方式都可以將一個N=12的序列分解成最小長序列(N=1),從而我們可以通過反覆運用Radix-2變換和Radix-3變換來求解出原序列的DFT變換。

上述DFT分解過程看起來簡單,但如果我們希望像之前一樣先對輸入序列進行合理排序,然後循環調用Raidx-based DFT算法來獲取結果,我們先要解決一個問題:怎樣對數據進行重排序?

我在網上找了半天,並沒有找到一種類似bit-reversal-ordering這樣的通用算法,所以我只能回到最笨的那個方法,就是按照圖解的思路一步步推演重排序列。

下面貼出混合基-2和基-3離散傅里葉變換的FFT代碼,要求輸入序列的長度必須能表達成N=2^m3^n這樣的形式。


/* yufft.c
* This C source file implements the mixed radix-2 & radix-3 DFT algorithm for 1D complex-to-complex forward Fourier transform.
* Written by: [email protected], Aug.15, 2019.
*/
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>

typedef struct { float re, im; } ComplexFloat;

#define YU_COMPLEX_MUL(A, B, C) \
C.re = A.re * B.re - A.im * B.im; \
C.im = A.re * B.im + A.im * B.re
#define YU_COMPLEX_ADD(A, B, C) \
C.re = A.re + B.re; C.im = A.im + B.im
#define YU_COMPLEX_SUB(A, B, C) \
C.re = A.re - B.re; C.im = A.im - B.im
#define YU_COMPLEX_CONJ(A, B) \
B.re = A.re; B.im = -A.im

#define YU_PI 3.1415926535897932384626433832795

int yuRadix2DFT(ComplexFloat *dst, const ComplexFloat *coef, int len, int folds, int N2)
{
	/*N is current dft length; N2 is half of N; 
	* stride is len/N; len2 is N2*stride; */
	int N = N2 * 2, stride = len / N, len2 = len / 2;
	int k, j, i;
	ComplexFloat *p, v;
	for(k = 0; k < folds; k++) {
		p = dst;
		for(i = 0; i < stride; i++) {
			/*radix-2 transform (loop N2 times)*/
			for(j = 0; j < len2; j += stride, p++) {
				YU_COMPLEX_MUL(p[N2], coef[j], v);
				YU_COMPLEX_SUB(p[0], v, p[N2]);
				YU_COMPLEX_ADD(p[0], v, p[0]);
			}
			p += N2;
		}
		N2 = N; N *= 2; stride /= 2;
	}
	return N2;
}

int yuRadix3DFT(ComplexFloat *dst, const ComplexFloat *coef, int len, int folds, int N3)
{
	/*N is current dft length; N3 is one third of N;
	* stride is len/N; len3 is N3*stride; */
	int N = N3 * 3, stride = len / N, len3 = len / 3;
	int k, j, i;
	ComplexFloat *p, v, v1, v2, u1, u2, W31, W32;
	W31 = coef[len3];
	W32 = coef[len3 * 2]; /*YU_COMPLEX_CONJ(W31, W32);*/
	for(k = 0; k < folds; k++) {
		p = dst;
		for(i = 0; i < stride; i++) {
			/*radix-3 transform (loop N3 times)*/
			for(j = 0; j < len3; j += stride, p++) {
				v = p[0];
				YU_COMPLEX_MUL(p[N3], coef[j], v1);
				YU_COMPLEX_MUL(p[N3 * 2], coef[j * 2], v2);
				/*p[0] = v + v1 + v2*/
				YU_COMPLEX_ADD(v1, v2, u1);
				YU_COMPLEX_ADD(v, u1, p[0]);
				/*p[N3] = v + v1*W31 + v2*W32*/
				YU_COMPLEX_MUL(v1, W31, u1);
				YU_COMPLEX_MUL(v2, W32, u2);
				YU_COMPLEX_ADD(u1, u2, u1);
				YU_COMPLEX_ADD(v, u1, p[N3]);
				/*p[N3*2] = v + v1*W32 + v2*W31*/
				YU_COMPLEX_MUL(v1, W32, u1);
				YU_COMPLEX_MUL(v2, W31, u2);
				YU_COMPLEX_ADD(u1, u2, u1);
				YU_COMPLEX_ADD(v, u1, p[N3 * 2]);
			}
			p += N3 * 2;
		}
		N3 = N; N *= 3; stride /= 3;
	}
	return N3;
}

/*return 0 if failed, return 1 if success*/
int yuFFT(const float *src0, float *dst0, int len)
{
	const ComplexFloat *src = (ComplexFloat*)src0;
	ComplexFloat *dst = (ComplexFloat*)dst0;
	/*Step 1. Factorize N into power of 2 and power of 3*/
	int powers2 = 0, powers3 = 0, i, j, k = len;
	while(!(k % 2)) {
		k /= 2;
		powers2++;
	}
	while(!(k % 3)) {
		k /= 3;
		powers3++;
	}
	if(k !=  1) {
		printf("Error: len cannot be factorized into 2 and 3");
		return 0;
	}
	/*Step 2. rearrange data in proper order*/
	int *order = (int*)malloc(len * sizeof(int));
	for(k = 0; k < len; k++)
		order[k] = k;
	int *tmp = (int*)malloc(len * sizeof(int));
	int N = len, ndft = 1;
	/*Step 2.1*/
	for(i = 0; i < powers3; i++) {
		for(j = 0; j < ndft; j++) {
			for(k = 0; k < N / 3; k++) {
				tmp[j * N + k] = order[j * N + k * 3];
				tmp[j * N + N / 3 + k] = order[j * N + k * 3 + 1];
				tmp[j * N + N * 2 / 3 + k] = order[j * N + k * 3 + 2];
			}
		}
		memcpy(order, tmp, len * sizeof(int));
		N /= 3; ndft *= 3;
	}
	/*Step 2.2*/
	for(i = 0; i < powers2; i++) {
		/*current dft length is N, and there are ndft arrays.
		we want to separate the odd-indexed sub-array from the even-indexed sub-array*/
		for(j = 0; j < ndft; j++) {
			for(k = 0; k < N / 2; k++) {
				tmp[j * N + k] = order[j * N + k * 2];
				tmp[j * N + N / 2 + k] = order[j * N + k * 2 + 1];
			}
		}
		memcpy(order, tmp, len * sizeof(int));
		N /= 2; ndft *= 2;
	}
	/*dst = src[order]*/
	for(k = 0; k < len; k++)
		dst[k] = src[order[k]];
	/*Step 3. prepare dft coefficients*/
	ComplexFloat *coef = (ComplexFloat*)malloc(len * sizeof(ComplexFloat));
	double theta;
	for(k = 0; k < len; k++) {
		theta = -2.0 * YU_PI * k / len;
		coef[k].re = (float)cos(theta);
		coef[k].im = (float)sin(theta);
	}
	/*Step 4. do radix dft*/
	N = 1;
	/*Step 4.1*/
	N = yuRadix2DFT(dst, coef, len, powers2, N);
	/*Step 4.2*/
	N = yuRadix3DFT(dst, coef, len, powers3, N);

	free(order);
	free(tmp);
	free(coef);
	return 1;
}

#undef YU_COMPLEX_MUL
#undef YU_COMPLEX_ADD
#undef YU_COMPLEX_SUB
#undef YU_PI

在寫程序的過程中,我先基於上一篇博客的代碼,把Radix-2部分單獨拎出來,封裝成一個函數。然後模仿它寫了Radix-3。在主函數yuFFT裏面,我先把N分解成2的指數與3的指數的乘積,然後兩個獨立的循環分別調用Radix-2和Radix-3算法即可。

在yuFFT函數中,Step 2部分是推演重排序方法的,完全模擬了圖解的過程,可能比較笨,但是比較好理解。並且我們可以提前計算好傅里葉係數以及重排序下標索引,建成一個表格,然後在實時運用中直接調用即可,而不是像上面的代碼中那樣,每次都從頭算一遍。

需要指出的是,雖然上述代碼裏是先調用Radix-2再調用Radix-3,但實際上我們也可以反過來做:先調用Radix-3再調用Radix-2,理論上結果不應發生變化。實際測試情況也是如此:將Step 2.1和Step 2.2調換順序,相應地把Step 4.1和Step 4.2也調換順序,最終結果不發生改變。

 

驗證程序

爲了測試程序是否正確,我寫了如下main函數,在隨機生成的序列上計算dft,然後將結果與fftw的計算結果進行比較。


/* main.c
* This C source file tests the DFT function and compares it with FFTW.
* Written by: [email protected], Aug.15, 2019.
*/
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <time.h>
#include "fftw3.h"

int yuFFT(const float *src, float *dst, int len);

int main()
{
	const int maxLength = 1024, nTrials = 3;

	/*Allocate memory*/
	float * const src = (float*)malloc(maxLength * 2 * sizeof(float));
	float * const dst = (float*)malloc(maxLength * 2 * sizeof(float));
	fftw_complex * const dsrc = (fftw_complex*)fftw_malloc(maxLength * sizeof(fftw_complex));
	fftw_complex * const ddst = (fftw_complex*)fftw_malloc(maxLength * sizeof(fftw_complex));

	srand((unsigned int)time(0));
	for(int len = 1; len <= maxLength; len++) {
		/*Test if len can be factorized into power of 2 and power of 3*/
		int k = len;
		while(!(k % 2)) k /= 2;
		while(!(k % 3)) k /= 3;
		if(k != 1)
			continue;

		/*Build fftw_plan*/
		int n[1] = { len };
		fftw_plan plan = fftw_plan_many_dft(1, n, 1, dsrc, 0, 1, 1, ddst, 0, 1, 1, FFTW_FORWARD, FFTW_ESTIMATE);

		for(int trial = 0; trial < nTrials; trial++) {
			/*Generate randoms between 0~1*/
			int *p = (int*)src, m = 0;
			for(k = 0; k < len * 2; k++) {
				p[k] = rand();
				if(p[k] > m)
					m = p[k];
			}
			if(!m)
				continue;
			float scale = 1.f / m;
			for(k = 0; k < len * 2; k++)
				src[k] = p[k] * scale;
			/*Apply yufft*/
			yuFFT(src, dst, (unsigned int)len);
			/*Apply fftw to the same data (using double precision)*/
			double *src1 = (double*)dsrc, *dst1 = (double*)ddst;
			for(k = 0; k < len * 2; k++)
				src1[k] = (double)src[k];
			fftw_execute(plan);
			/*Compute the difference*/
			double re, im, dif = 0;
			for(k = 0; k < len * 2;) {
				re = dst1[k] - dst[k]; k++;
				im = dst1[k] - dst[k]; k++;
				re = re * re + im * im;
				if(dif < re)
					dif = re;
			}
			dif = sqrt(dif);
			printf("len = %4d, trial = %2d, dif = %g\n", len, trial, dif);
		}
		fftw_destroy_plan(plan);
	}

	free(src);
	free(dst);
	fftw_free(dsrc);
	fftw_free(ddst);
	getchar();
	return 0;
}

程序在我電腦上運行後在屏幕上打印了以下信息:

len =    1, trial =  0, dif = 0
len =    1, trial =  1, dif = 0
len =    1, trial =  2, dif = 0
len =    2, trial =  0, dif = 2.26601e-08
len =    2, trial =  1, dif = 5.96046e-08
len =    2, trial =  2, dif = 0
len =    3, trial =  0, dif = 4.21468e-08
len =    3, trial =  1, dif = 9.06402e-08
len =    3, trial =  2, dif = 1.78969e-07
len =    4, trial =  0, dif = 8.2041e-08
len =    4, trial =  1, dif = 7.45058e-08
len =    4, trial =  2, dif = 4.01226e-08
len =    6, trial =  0, dif = 9.87202e-08
len =    6, trial =  1, dif = 1.06676e-07
len =    6, trial =  2, dif = 8.20255e-08
len =    8, trial =  0, dif = 2.40105e-07
len =    8, trial =  1, dif = 1.19209e-07
len =    8, trial =  2, dif = 1.77098e-07
len =    9, trial =  0, dif = 3.80564e-07
len =    9, trial =  1, dif = 2.9578e-07
len =    9, trial =  2, dif = 2.82399e-07
len =   12, trial =  0, dif = 3.38711e-07
len =   12, trial =  1, dif = 5.27257e-07
len =   12, trial =  2, dif = 2.74954e-07
len =   16, trial =  0, dif = 6.43344e-07
len =   16, trial =  1, dif = 3.53334e-07
len =   16, trial =  2, dif = 4.21732e-07
len =   18, trial =  0, dif = 3.48184e-07
len =   18, trial =  1, dif = 3.94143e-07
len =   18, trial =  2, dif = 6.19363e-07
len =   24, trial =  0, dif = 8.69895e-07
len =   24, trial =  1, dif = 6.437e-07
len =   24, trial =  2, dif = 4.47097e-07
len =   27, trial =  0, dif = 5.06819e-07
len =   27, trial =  1, dif = 6.37192e-07
len =   27, trial =  2, dif = 8.66024e-07
len =   32, trial =  0, dif = 1.6421e-06
len =   32, trial =  1, dif = 1.01241e-06
len =   32, trial =  2, dif = 1.01715e-06
len =   36, trial =  0, dif = 1.54931e-06
len =   36, trial =  1, dif = 1.37023e-06
len =   36, trial =  2, dif = 1.24578e-06
len =   48, trial =  0, dif = 1.45634e-06
len =   48, trial =  1, dif = 1.60746e-06
len =   48, trial =  2, dif = 1.20504e-06
len =   54, trial =  0, dif = 1.01185e-06
len =   54, trial =  1, dif = 1.26829e-06
len =   54, trial =  2, dif = 2.09262e-06
len =   64, trial =  0, dif = 2.38896e-06
len =   64, trial =  1, dif = 2.07697e-06
len =   64, trial =  2, dif = 9.96666e-07
len =   72, trial =  0, dif = 2.56995e-06
len =   72, trial =  1, dif = 2.7394e-06
len =   72, trial =  2, dif = 1.41041e-06
len =   81, trial =  0, dif = 2.33831e-06
len =   81, trial =  1, dif = 2.76485e-06
len =   81, trial =  2, dif = 1.91473e-06
len =   96, trial =  0, dif = 2.72731e-06
len =   96, trial =  1, dif = 2.64419e-06
len =   96, trial =  2, dif = 2.89764e-06
len =  108, trial =  0, dif = 3.06442e-06
len =  108, trial =  1, dif = 2.36151e-06
len =  108, trial =  2, dif = 2.38387e-06
len =  128, trial =  0, dif = 1.84808e-06
len =  128, trial =  1, dif = 6.65461e-06
len =  128, trial =  2, dif = 1.84561e-06
len =  144, trial =  0, dif = 3.87558e-06
len =  144, trial =  1, dif = 3.0805e-06
len =  144, trial =  2, dif = 7.03167e-06
len =  162, trial =  0, dif = 8.56079e-06
len =  162, trial =  1, dif = 7.01387e-06
len =  162, trial =  2, dif = 4.59154e-06
len =  192, trial =  0, dif = 3.47296e-06
len =  192, trial =  1, dif = 7.76133e-06
len =  192, trial =  2, dif = 6.44599e-06
len =  216, trial =  0, dif = 5.48365e-06
len =  216, trial =  1, dif = 8.35908e-06
len =  216, trial =  2, dif = 8.87525e-06
len =  243, trial =  0, dif = 5.9614e-06
len =  243, trial =  1, dif = 5.46705e-06
len =  243, trial =  2, dif = 8.96566e-06
len =  256, trial =  0, dif = 1.15965e-05
len =  256, trial =  1, dif = 6.85758e-06
len =  256, trial =  2, dif = 5.0958e-06
len =  288, trial =  0, dif = 1.24003e-05
len =  288, trial =  1, dif = 8.42293e-06
len =  288, trial =  2, dif = 1.08131e-05
len =  324, trial =  0, dif = 6.82868e-06
len =  324, trial =  1, dif = 1.09608e-05
len =  324, trial =  2, dif = 1.11663e-05
len =  384, trial =  0, dif = 6.61297e-06
len =  384, trial =  1, dif = 2.31987e-05
len =  384, trial =  2, dif = 9.79988e-06
len =  432, trial =  0, dif = 1.82128e-05
len =  432, trial =  1, dif = 2.16808e-05
len =  432, trial =  2, dif = 1.35217e-05
len =  486, trial =  0, dif = 1.19002e-05
len =  486, trial =  1, dif = 9.25769e-06
len =  486, trial =  2, dif = 1.14542e-05
len =  512, trial =  0, dif = 7.7354e-06
len =  512, trial =  1, dif = 8.70772e-06
len =  512, trial =  2, dif = 1.67701e-05
len =  576, trial =  0, dif = 2.12049e-05
len =  576, trial =  1, dif = 1.80019e-05
len =  576, trial =  2, dif = 1.81657e-05
len =  648, trial =  0, dif = 2.52407e-05
len =  648, trial =  1, dif = 1.60892e-05
len =  648, trial =  2, dif = 2.56206e-05
len =  729, trial =  0, dif = 1.24652e-05
len =  729, trial =  1, dif = 2.3825e-05
len =  729, trial =  2, dif = 1.2219e-05
len =  768, trial =  0, dif = 2.42783e-05
len =  768, trial =  1, dif = 1.72344e-05
len =  768, trial =  2, dif = 2.45952e-05
len =  864, trial =  0, dif = 3.24563e-05
len =  864, trial =  1, dif = 2.2366e-05
len =  864, trial =  2, dif = 2.00084e-05
len =  972, trial =  0, dif = 3.12402e-05
len =  972, trial =  1, dif = 2.28148e-05
len =  972, trial =  2, dif = 3.02124e-05
len = 1024, trial =  0, dif = 4.21749e-05
len = 1024, trial =  1, dif = 2.85143e-05
len = 1024, trial =  2, dif = 3.80336e-05

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章