3.6.4. filters.cpp
#
3.6.4.1. Implementation#
Multi-rate direct overlap-save filter class:
FilterOverlapSave
Polyphase overlap-save filter class:
FilterPolyphase
Header file:
filters.hpp
3.6.4.2. Source code#
1// University of Florida EEL6528
2// Tan F. Wong
3// Feb 2, 2021
4
5#include "filters.hpp"
6
7// OverlapSave filter class
8// Set up FFT objects and filter's frequency response
9void FilterOverlapSave::setup(int x_len, int h_len, std::complex<float>* h, int nt)
10{
11 // Set up sizes for the overlap-save algorithm
12 nx = x_len;
13 Unx = nx*U;
14 if (Unx%D != 0)
15 printf("[WARNING] For proper continuous filtering, input_length*U must be divisible by D!\n");
16 L = h_len;
17 double LL = (L<32) ? 32.0 : double(L); // If L too small, use 32
18 // Choose FFT size to be 2^m, where m is smallest power that 2^m >= 8*L
19 fftsize = int(pow(2.0, ceil(log2(LL)+3)));
20
21 M = fftsize-L+1;
22 // Work out the number of FFT blocks needed
23 nblocks = int(ceil(double(Unx)/M));
24 // set number of threads to use
25 nthreads = nt;
26
27 // Set up forward and inverse FFT objects
28 fwdfft = new fft(nthreads, fftsize, nblocks, M, fftsize, false);
29 invfft = new fft(nthreads, fftsize, nblocks, fftsize, fftsize, true, fwdfft->get_out());
30
31 // Calculate and save FFT of h
32
33 fft fft1time(1, fftsize, 1, 0, 0, false);
34 std::complex<float>* a = fft1time.get_in();
35 for (int i=0; i<L; i++) {
36 a[i] = h[i];
37 }
38 for (int i=L; i<fftsize; i++) {
39 a[i] = 0.0;
40 }
41 fft1time.calculate();
42 H = (std::complex<float>*) fftwf_malloc(fftsize*sizeof(std::complex<float>));
43 std::copy(fft1time.get_out(), fft1time.get_out()+fftsize, H);
44
45 // Set whole fwd->in to 0
46 std::fill(fwdfft->get_in(), fwdfft->get_in()+nblocks*fftsize, 0.0);
47}
48
49// Set head for next call in continuous filtering
50// For continuous filtering U*(length of input) must be divisible by D
51void FilterOverlapSave::set_head(bool reset) {
52 std::complex<float>* in = fwdfft->get_in();
53 if (reset)
54 std::fill(in, in+L-1, 0.0);
55 else
56 std::copy(in+Unx, in+Unx+L-1, in);
57}
58
59// Single-rate FilterOverlapSave class constructor (U=D=1)
60FilterOverlapSave::FilterOverlapSave(int x_len, int h_len, std::complex<float>* h, int nthreads)
61{
62 // Set up upsampling and downsampling rates to 1
63 U = 1;
64 D = 1;
65 // Set up FFTs and filter's freq response
66 this->setup(x_len, h_len, h, nthreads);
67}
68
69// Multi-rate FilterOverlapSave class constructor
70FilterOverlapSave::FilterOverlapSave(int up, int down, int x_len, int h_len, std::complex<float>* h, int nthreads)
71{
72 // Set up upsampling and downsampling rates to specified values
73 U = up;
74 D = down;
75 // Set up FFTs and filter's freq response
76 this->setup(x_len, h_len, h, nthreads);
77}
78
79// Class destructor
80FilterOverlapSave::~FilterOverlapSave(void) {
81 fftwf_free(H);
82 delete fwdfft;
83 delete invfft;
84}
85
86// Calculate output length
87// N.B.: The implementation of the filter method below outputs
88// floor(U*len(input)/D) samples. The transient in the beginning is not
89// included in the output. This is done so that the filter method
90// may be used to filter a continuous stream of samples.
91int FilterOverlapSave::out_len(void) {
92 return int(floor(double(Unx)/D));
93}
94
95// Filter the input in and obtain output out
96int FilterOverlapSave::filter(std::complex<float>* in, std::complex<float>* out)
97{
98 // Upsampling in by U
99 std::complex<float>* up_in = fwdfft->get_in();
100 if (U==1) {
101 std::copy(in, in+nx, up_in+L-1);
102 } else {
103 int idx = L-1;
104 for (int i=0; i<nx; i++) {
105 up_in[idx] = in[i];
106 idx += U;
107 }
108 }
109
110 // Implement overlap-save
111 fwdfft->calculate();
112 std::complex<float>* transformed = fwdfft->get_out();
113 for (int i=0; i<nblocks; i++) {
114 int idx = i*fftsize;
115#ifdef USE_VOLK
116 volk_32fc_x2_multiply_32fc(transformed+idx, transformed+idx, H, fftsize);
117#else
118 for (int j=0; j<fftsize; j++) {
119 transformed[idx++] *= H[j];
120 }
121#endif
122 }
123 invfft->calculate();
124
125 std::complex<float>* y = invfft->get_out();
126 // Discard overlap output corrupted by circular conv.
127 int idx = L-1;
128 int oidx = 0;
129 std::complex<float>* outbuf = transformed;
130 if (D==1) outbuf = out; // put to output array directly
131 for (int i=0; i<nblocks-1; i++) {
132 std::copy(y+idx, y+idx+M, outbuf+oidx);
133 idx += fftsize;
134 oidx += M;
135 }
136 // Last block
137 int nlast = Unx-(nblocks-1)*M;
138 std::copy(y+idx, y+idx+nlast, outbuf+oidx);
139 int nout = this->out_len();
140 if (D>1) {
141 // Downsampling filtered by D
142 for (int i=0; i<nout; i++) {
143 out[i] = outbuf[i*D];
144 }
145 }
146 return nout;
147}
148
149// Polyphase filter class
150// class constructor
151FilterPolyphase::FilterPolyphase(int up, int down, int x_len, int h_len, std::complex<float>* h, int nt) {
152 // Set up upsampling and downsampling rates to specified values
153 U = up;
154 D = down;
155
156 nx = x_len; // length of input signal
157 if (nx%D != 0)
158 printf("[WARNING] For proper continuous filtering, input_length must be divisible by D!\n");
159 nxD = int(floor(double(nx)/D))+1; // length of down-sampled input signals
160 // Set up lengths for the polyphase overlap-save algorithm
161 L = h_len; // Length of orginal filter in up-sampled domain
162 Lp = int(ceil(double(L)/U/D))+1; // length of polyphase filters
163 double LL = (Lp<32) ? 32.0 : double(Lp); // If L too small, use 32
164 // Choose FFT size to be 2^m, where m is smallest power that 2^m >= 8*L
165 fftsize = int(pow(2.0, ceil(log2(LL)+2)));
166 M = fftsize-Lp+1; //Length of data block for each polyphase filter
167
168 // Work out the number of FFT blocks needed per down-sampled branch
169 nblocks = int(ceil(double(nxD)/M));
170 // set number of threads to use
171 nthreads = nt;
172
173 // Set up forward and inverse FFT objects
174 fwdfft = new fft(nthreads, fftsize, nblocks, M, fftsize, D, false);
175 invfft = new fft(nthreads, fftsize, nblocks, fftsize, fftsize, U, true);
176
177 // Calculate and save FFTs of polyphase filters' impulse responses
178 F = (std::complex<float>*) fftwf_malloc(U*D*fftsize*sizeof(std::complex<float>));
179 // Append zeros to the beginning of h to get hh for convenience
180 // Also hh[n] = h[n-U*D]
181 std::complex<float> hh[fftsize*U*D];
182 std::copy(h, h+L, hh+U*D);
183 fft fft1time(1, fftsize, 1, 0, 0, false);
184 std::complex<float>* f = fft1time.get_in();
185 // Note that f[n] = f_{pq}[n-1] below
186 for (int p=0; p<U; p++) {
187 for (int q=0; q<D; q++) {
188 for (int i=0; i<Lp; i++) {
189 f[i] = hh[U*D*i+D*p+U*q];
190 }
191 for (int i=Lp; i<fftsize; i++) {
192 f[i] = 0.0;
193 }
194 fft1time.calculate();
195 std::copy(fft1time.get_out(), fft1time.get_out()+fftsize, F+(p*D+q)*fftsize);
196 }
197 }
198
199 // Set whole fwd->in to 0
200 std::fill(fwdfft->get_in(), fwdfft->get_in()+nblocks*D*fftsize, 0.0);
201}
202
203// Class destructor
204FilterPolyphase::~FilterPolyphase(void) {
205 fftwf_free(F);
206 delete fwdfft;
207 delete invfft;
208}
209
210// Set head for next call in continuous filtering
211// For continuous filtering length of input must be divisible by D
212void FilterPolyphase::set_head(bool reset) {
213 std::complex<float>* in = fwdfft->get_in();
214 if (reset) {
215 for (int q=0; q<D; q++) {
216 int startidx = q*nblocks*fftsize;
217 int endidx = (q==0) ? startidx+Lp-1 : startidx+Lp;
218 std::fill(in+startidx, in+endidx, 0.0);
219 }
220 } else {
221 int destidx = 0;
222 int endidx = destidx+nxD+Lp-2;
223 int startidx = endidx-Lp+1;
224 std::copy(in+startidx, in+endidx, in+destidx);
225 for (int q=1; q<D; q++) {
226 destidx += nblocks*fftsize;
227 endidx = destidx+nxD+Lp-1;
228 startidx = endidx-Lp;
229 std::copy(in+startidx, in+endidx, in+destidx);
230 }
231 }
232}
233
234// Calculate output length
235// Calculate output length
236// N.B.: The implementation of the filter method below outputs
237// floor(len(input)/D)*U samples. The transient in the beginning is not
238// included in the output. This is done so that the filter method
239// may be used to filter a continuous stream of samples.
240int FilterPolyphase::out_len(void) {
241 return int(floor(double(nx)/D))*U;
242}
243
244// Do polyphase filtering
245// N.B.: This implementation introduces an additional delay of U samples at output
246int FilterPolyphase::filter(std::complex<float>* in, std::complex<float>* out) {
247
248 // Step 1 of freq-domain polyphase filtering algorithm
249 std::complex<float>* a = fwdfft->get_in();
250 int outidx = Lp-1;
251 int idx = 0;
252 for (int i=0; i<nxD-1; i++) {
253 a[outidx++] = in[idx];
254 idx += D;
255 }
256 for (int q=1; q<D; q++) {
257 outidx = q*nblocks*fftsize+Lp;
258 idx = (D-q)%D;
259 for (int i=0; i<nxD; i++) {
260 a[outidx++] = in[idx];
261 idx += D;
262 }
263 }
264 // Step 2 of freq-domain polyphase filtering algorithm
265 fwdfft->calculate();
266 // Step 3 of freq-domain polyphase filtering algorithm
267 a = fwdfft->get_out();
268 std::complex<float>* b = invfft->get_in();
269 std::complex<float>* tmp_result = (std::complex<float>*) fftwf_malloc(fftsize*sizeof(std::complex<float>));
270 int Fidx = 0;
271 for (int p=0; p<U; p++) {
272 int startu = p*nblocks*fftsize;
273 int uidx = startu;
274 int idx = 0;
275 for (int i=0; i<nblocks; i++) {
276#ifdef USE_VOLK
277 volk_32fc_x2_multiply_32fc(b+uidx, a+idx, F+Fidx, fftsize);
278#else
279 for (int j=0; j<fftsize; j++) {
280 b[uidx+j] = F[Fidx+j] * a[idx+j];
281 }
282#endif
283 uidx += fftsize;
284 idx += fftsize;
285 }
286 Fidx += fftsize;
287 for (int q=1; q<D; q++) {
288 uidx = startu;
289 for (int i=0; i<nblocks; i++) {
290#ifdef USE_VOLK
291 volk_32fc_x2_multiply_32fc(tmp_result, a+idx, F+Fidx, fftsize);
292 volk_32fc_x2_add_32fc(b+uidx, b+uidx, tmp_result, fftsize);
293#else
294 for (int j=0; j<fftsize; j++) {
295 b[uidx+j] += F[Fidx+j] * a[idx+j];
296 }
297#endif
298 uidx += fftsize;
299 idx += fftsize;
300 }
301 Fidx += fftsize;
302 }
303 }
304 // Step 4 of freq-domain polyphase filtering algorithm
305 invfft->calculate();
306 // Steps 5 and 6 of freq-domain polyphase filtering algorithm
307 b = invfft->get_out();
308 outidx = 0;
309 int olen = this->out_len();
310 for (int i=0; i<nblocks; i++) {
311 for (int j=0; j<M and outidx<olen; j++) {
312 int uidx = i*fftsize+j+Lp-1;
313 for (int p=0; p<U and outidx<olen; p++) {
314 out[outidx++] = b[uidx];
315 uidx += fftsize*nblocks;
316 }
317 }
318 }
319 // Cleanup
320 fftwf_free(tmp_result);
321 return olen;
322}
FFT size selection heuristic: Choose \(m\) to be the minimum integer such that \(2^m \geq 8 L\) in
FilterOverlapSave
(\(2^m \geq 8 L_p\) inFilterPolyphase
), and set the FFT size to be \(2^m\). This selection makes sure that the condition that \(2^m \gg L\) (\(2^m \gg L_p)\). If \(L\) (\(L_p\)) is long enough, say \(L\) (\(L_p\)) \(\geq 32\), we also have \(L \gg mD\) (\(L_p \gg m\)).Since the
fft
class is used, it is nice to run the methodfft:cleanup()
beforemain()
exits when using the filter classes.Usage example:
test_filters.cpp
Experiment
Study the source code for
FilterOverlapSave
andFilterPolyphase
.While I have tried to use the FFTW3 interfaces very efficiently and VOLK, the other operations required in the frequency-domain filtering implementations are not exactly optimized. Try to work on developing your own implementation to see if you can beat mine:
For example, one may be able to do a better job in structuring the memory storage organization of the input and output, in particular in
FilterPolyphase
, to better faciltiate SIMD acceleration.Another possibility for speed-up is to write multi-threaded implementations for the “non-FFT” steps in the frequency-domain filtering algorithms.
Try to see if you can develop a “better” approach to choose
fftsize
.