3.6.4. filters.cpp#

3.6.4.1. Implementation#

  1. Multi-rate direct overlap-save filter class: FilterOverlapSave

  2. Polyphase overlap-save filter class: FilterPolyphase

3.6.4.2. Source code#

  1// University of Florida EEL6528
  2// Tan F. Wong
  3// Feb 2, 2021
  4
  5#include "filters.hpp"
  6
  7// OverlapSave filter class
  8// Set up FFT objects and filter's frequency response
  9void FilterOverlapSave::setup(int x_len, int h_len, std::complex<float>* h, int nt)
 10{
 11    // Set up sizes for the overlap-save algorithm
 12    nx = x_len;
 13    Unx = nx*U;
 14    if (Unx%D != 0)
 15        printf("[WARNING] For proper continuous filtering, input_length*U must be divisible by D!\n");
 16    L = h_len;
 17    double LL = (L<32) ? 32.0 : double(L); // If L too small, use 32
 18    // Choose FFT size to be 2^m, where m is smallest power that 2^m >= 8*L
 19    fftsize = int(pow(2.0, ceil(log2(LL)+3)));
 20    
 21    M = fftsize-L+1;
 22    // Work out the number of FFT blocks needed
 23    nblocks = int(ceil(double(Unx)/M));
 24    // set number of threads to use 
 25    nthreads = nt;
 26    
 27    // Set up forward and inverse FFT objects
 28    fwdfft = new fft(nthreads, fftsize, nblocks, M, fftsize, false);
 29    invfft = new fft(nthreads, fftsize, nblocks, fftsize, fftsize, true, fwdfft->get_out());
 30
 31    // Calculate and save FFT of h
 32    
 33    fft fft1time(1, fftsize, 1, 0, 0, false);
 34    std::complex<float>* a = fft1time.get_in();
 35    for (int i=0; i<L; i++) {
 36        a[i] = h[i];
 37    }
 38    for (int i=L; i<fftsize; i++) {
 39        a[i] = 0.0;
 40    }
 41    fft1time.calculate();
 42    H = (std::complex<float>*) fftwf_malloc(fftsize*sizeof(std::complex<float>));
 43    std::copy(fft1time.get_out(), fft1time.get_out()+fftsize, H);
 44
 45    // Set whole fwd->in to 0
 46    std::fill(fwdfft->get_in(), fwdfft->get_in()+nblocks*fftsize, 0.0);
 47}
 48
 49// Set head for next call in continuous filtering
 50// For continuous filtering U*(length of input) must be divisible by D
 51void FilterOverlapSave::set_head(bool reset) {
 52    std::complex<float>* in = fwdfft->get_in();
 53    if (reset)
 54        std::fill(in, in+L-1, 0.0);
 55    else
 56        std::copy(in+Unx, in+Unx+L-1, in);
 57}
 58
 59// Single-rate FilterOverlapSave class constructor (U=D=1)
 60FilterOverlapSave::FilterOverlapSave(int x_len, int h_len, std::complex<float>* h, int nthreads)
 61{
 62    // Set up upsampling and downsampling rates to 1
 63    U = 1;
 64    D = 1;
 65    // Set up FFTs and filter's freq response
 66    this->setup(x_len, h_len, h, nthreads);
 67}
 68
 69// Multi-rate FilterOverlapSave class constructor
 70FilterOverlapSave::FilterOverlapSave(int up, int down, int x_len, int h_len, std::complex<float>* h, int nthreads)
 71{
 72    // Set up upsampling and downsampling rates to specified values 
 73    U = up;
 74    D = down;
 75    // Set up FFTs and filter's freq response
 76    this->setup(x_len, h_len, h, nthreads);
 77}
 78
 79// Class destructor
 80FilterOverlapSave::~FilterOverlapSave(void) {
 81    fftwf_free(H);
 82    delete fwdfft;
 83    delete invfft;
 84}
 85
 86// Calculate output length
 87// N.B.: The implementation of the filter method below outputs
 88// floor(U*len(input)/D) samples. The transient in the beginning is not
 89// included in the output. This is done so that the filter method 
 90// may be used to filter a continuous stream of samples.
 91int FilterOverlapSave::out_len(void) {
 92    return int(floor(double(Unx)/D));
 93}
 94    
 95// Filter the input in and obtain output out
 96int FilterOverlapSave::filter(std::complex<float>* in, std::complex<float>* out)
 97{
 98    // Upsampling in by U
 99    std::complex<float>* up_in = fwdfft->get_in();
100    if (U==1) {
101        std::copy(in, in+nx, up_in+L-1);
102    } else {
103        int idx = L-1;
104        for (int i=0; i<nx; i++) {
105            up_in[idx] = in[i];
106            idx += U;
107        }
108    }
109
110    // Implement overlap-save
111    fwdfft->calculate();
112    std::complex<float>* transformed = fwdfft->get_out();
113    for (int i=0; i<nblocks; i++) {
114        int idx = i*fftsize;
115#ifdef USE_VOLK
116        volk_32fc_x2_multiply_32fc(transformed+idx, transformed+idx, H, fftsize);
117#else
118        for (int j=0; j<fftsize; j++) {
119            transformed[idx++] *= H[j];
120        }
121#endif
122    }
123    invfft->calculate();
124
125    std::complex<float>* y = invfft->get_out();
126    // Discard overlap output corrupted by circular conv.
127    int idx = L-1;
128    int oidx = 0;
129    std::complex<float>* outbuf = transformed;
130    if (D==1) outbuf = out; // put to output array directly
131    for (int i=0; i<nblocks-1; i++) {
132        std::copy(y+idx, y+idx+M, outbuf+oidx);
133        idx += fftsize;
134        oidx += M;
135    }
136    // Last block
137    int nlast = Unx-(nblocks-1)*M;
138    std::copy(y+idx, y+idx+nlast, outbuf+oidx);
139    int nout = this->out_len();
140    if (D>1) { 
141        // Downsampling filtered by D
142        for (int i=0; i<nout; i++) {
143            out[i] = outbuf[i*D];
144        }
145    }
146    return nout;
147}
148
149// Polyphase filter class
150// class constructor
151FilterPolyphase::FilterPolyphase(int up, int down, int x_len, int h_len, std::complex<float>* h, int nt) {
152    // Set up upsampling and downsampling rates to specified values 
153    U = up;
154    D = down;
155
156    nx = x_len; // length of input signal
157    if (nx%D != 0)
158        printf("[WARNING] For proper continuous filtering, input_length must be divisible by D!\n");
159    nxD = int(floor(double(nx)/D))+1; // length of down-sampled input signals
160    // Set up lengths for the polyphase overlap-save algorithm
161    L = h_len; // Length of orginal filter in up-sampled domain
162    Lp = int(ceil(double(L)/U/D))+1; // length of polyphase filters
163    double LL = (Lp<32) ? 32.0 : double(Lp); // If L too small, use 32
164    // Choose FFT size to be 2^m, where m is smallest power that 2^m >= 8*L
165    fftsize = int(pow(2.0, ceil(log2(LL)+2)));
166    M = fftsize-Lp+1; //Length of data block for each polyphase filter
167
168    // Work out the number of FFT blocks needed per down-sampled branch 
169    nblocks = int(ceil(double(nxD)/M));
170    // set number of threads to use 
171    nthreads = nt;
172    
173    // Set up forward and inverse FFT objects
174    fwdfft = new fft(nthreads, fftsize, nblocks, M, fftsize, D, false);
175    invfft = new fft(nthreads, fftsize, nblocks, fftsize, fftsize, U, true);
176
177    // Calculate and save FFTs of polyphase filters' impulse responses
178    F = (std::complex<float>*) fftwf_malloc(U*D*fftsize*sizeof(std::complex<float>));
179    // Append zeros to the beginning of h to get hh for convenience
180    // Also hh[n] = h[n-U*D]
181    std::complex<float> hh[fftsize*U*D];
182    std::copy(h, h+L, hh+U*D);
183    fft fft1time(1, fftsize, 1, 0, 0, false);
184    std::complex<float>* f = fft1time.get_in();
185    // Note that f[n] = f_{pq}[n-1] below
186    for (int p=0; p<U; p++) {
187        for (int q=0; q<D; q++) {
188            for (int i=0; i<Lp; i++) {
189                f[i] = hh[U*D*i+D*p+U*q];
190            }
191            for (int i=Lp; i<fftsize; i++) {
192                f[i] = 0.0;
193            }
194            fft1time.calculate();
195            std::copy(fft1time.get_out(), fft1time.get_out()+fftsize, F+(p*D+q)*fftsize);
196        }
197    }
198
199    // Set whole fwd->in to 0
200    std::fill(fwdfft->get_in(), fwdfft->get_in()+nblocks*D*fftsize, 0.0);
201}
202
203// Class destructor
204FilterPolyphase::~FilterPolyphase(void) {
205    fftwf_free(F);
206    delete fwdfft;
207    delete invfft;
208}
209
210// Set head for next call in continuous filtering
211// For continuous filtering length of input must be divisible by D
212void FilterPolyphase::set_head(bool reset) {
213    std::complex<float>* in = fwdfft->get_in();
214    if (reset) {
215        for (int q=0; q<D; q++) {
216            int startidx = q*nblocks*fftsize;
217            int endidx = (q==0) ? startidx+Lp-1 : startidx+Lp;
218            std::fill(in+startidx, in+endidx, 0.0);
219        }
220    } else {
221        int destidx = 0;
222        int endidx = destidx+nxD+Lp-2;
223        int startidx = endidx-Lp+1;
224        std::copy(in+startidx, in+endidx, in+destidx);
225        for (int q=1; q<D; q++) {
226            destidx += nblocks*fftsize;
227            endidx = destidx+nxD+Lp-1;
228            startidx = endidx-Lp;
229            std::copy(in+startidx, in+endidx, in+destidx);
230        }
231    }
232}
233
234// Calculate output length
235// Calculate output length
236// N.B.: The implementation of the filter method below outputs
237// floor(len(input)/D)*U samples. The transient in the beginning is not
238// included in the output. This is done so that the filter method 
239// may be used to filter a continuous stream of samples.
240int FilterPolyphase::out_len(void) {
241    return int(floor(double(nx)/D))*U;
242}
243
244// Do polyphase filtering
245// N.B.: This implementation introduces an additional delay of U samples at output
246int FilterPolyphase::filter(std::complex<float>* in, std::complex<float>* out) {
247    
248    // Step 1 of freq-domain polyphase filtering algorithm
249    std::complex<float>* a = fwdfft->get_in();
250    int outidx = Lp-1;
251    int idx = 0;
252    for (int i=0; i<nxD-1; i++) {
253        a[outidx++] = in[idx];
254        idx += D;
255    }
256    for (int q=1; q<D; q++) {
257        outidx = q*nblocks*fftsize+Lp;
258        idx = (D-q)%D;
259        for (int i=0; i<nxD; i++) {
260            a[outidx++] = in[idx];
261            idx += D;
262        }
263    }
264    // Step 2 of freq-domain polyphase filtering algorithm
265    fwdfft->calculate();
266    // Step 3 of freq-domain polyphase filtering algorithm
267    a = fwdfft->get_out();
268    std::complex<float>* b = invfft->get_in();
269    std::complex<float>* tmp_result = (std::complex<float>*) fftwf_malloc(fftsize*sizeof(std::complex<float>));
270    int Fidx = 0;
271    for (int p=0; p<U; p++) {
272        int startu = p*nblocks*fftsize;
273        int uidx = startu;
274        int idx = 0;
275        for (int i=0; i<nblocks; i++) {
276#ifdef USE_VOLK
277            volk_32fc_x2_multiply_32fc(b+uidx, a+idx, F+Fidx, fftsize);
278#else
279            for (int j=0; j<fftsize; j++) {
280                b[uidx+j] = F[Fidx+j] * a[idx+j];
281            }        
282#endif
283            uidx += fftsize;
284            idx += fftsize;
285        }
286        Fidx += fftsize;
287        for (int q=1; q<D; q++) {
288            uidx = startu;
289            for (int i=0; i<nblocks; i++) {
290#ifdef USE_VOLK
291                volk_32fc_x2_multiply_32fc(tmp_result, a+idx, F+Fidx, fftsize);
292                volk_32fc_x2_add_32fc(b+uidx, b+uidx, tmp_result, fftsize);
293#else
294                for (int j=0; j<fftsize; j++) {
295                    b[uidx+j] += F[Fidx+j] * a[idx+j];
296                }     
297#endif
298                uidx += fftsize;
299                idx += fftsize;
300            }
301            Fidx += fftsize;
302        }
303    }
304    // Step 4 of freq-domain polyphase filtering algorithm
305    invfft->calculate();
306    // Steps 5 and 6 of freq-domain polyphase filtering algorithm
307    b = invfft->get_out();
308    outidx = 0;
309    int olen = this->out_len();
310    for (int i=0; i<nblocks; i++) {
311        for (int j=0; j<M and outidx<olen; j++) {
312            int uidx = i*fftsize+j+Lp-1;
313            for (int p=0; p<U and outidx<olen; p++) {
314                out[outidx++] = b[uidx];
315                uidx += fftsize*nblocks;
316            }
317        }
318    }
319    // Cleanup
320    fftwf_free(tmp_result);
321    return olen;
322}
  • FFT size selection heuristic: Choose \(m\) to be the minimum integer such that \(2^m \geq 8 L\) in FilterOverlapSave (\(2^m \geq 8 L_p\) in FilterPolyphase), and set the FFT size to be \(2^m\). This selection makes sure that the condition that \(2^m \gg L\) (\(2^m \gg L_p)\). If \(L\) (\(L_p\)) is long enough, say \(L\) (\(L_p\)) \(\geq 32\), we also have \(L \gg mD\) (\(L_p \gg m\)).

  • Since the fft class is used, it is nice to run the method fft:cleanup() before main() exits when using the filter classes.

  • Usage example: test_filters.cpp

Experiment

  1. Study the source code for FilterOverlapSave and FilterPolyphase.

  2. While I have tried to use the FFTW3 interfaces very efficiently and VOLK, the other operations required in the frequency-domain filtering implementations are not exactly optimized. Try to work on developing your own implementation to see if you can beat mine:

    • For example, one may be able to do a better job in structuring the memory storage organization of the input and output, in particular in FilterPolyphase, to better faciltiate SIMD acceleration.

    • Another possibility for speed-up is to write multi-threaded implementations for the “non-FFT” steps in the frequency-domain filtering algorithms.

  3. Try to see if you can develop a “better” approach to choose fftsize.