[ freeverb3-vst.sourceforge.io | freeverb3@Savannah | sourceforge.net ]

Optimization Tips

Convolution

The convolution algorithm of Freeverb3 uses some techniques to achieve fast processing. This article introduces some of them.

Array rearrangement

You should calculate the (a+bi)*(c+di)=(ac-bd)+(ad+bc)i loop in the frequency domain to do the convolution.

We can use Halfcomplex-format DFT of the FFTW3 library since we do not need to calculate imaginary numbers.

float *in, *out;
in = fftwf_malloc(sizeof(float)*size);
out = fftwf_malloc(sizeof(float)*size);
fftwf_plan p;
p = fftwf_plan_r2r_1d(size, in, out, FFTW_R2HC, FFTW_ESTIMATE);
fftwf_execute(p);
fftwf_destroy_plan(p);
fftwf_free(in);
fftwf_free(out);
    

Reverse transform

fftwf_plan p;
p = fftwf_plan_r2r_1d(size, in, out, FFTW_HC2R, FFTW_ESTIMATE);
fftwf_execute(p);
fftwf_destroy_plan(p);

After the DFT calculation, you will get the following arrays.

r0, r1, r2, ..., rn/2, i(n+1)/2-1, ..., i2, i1

So, you should run the following code to do the convolution.

out[0] = x[0] * y[0]; // r0
out[n/2] = x[n/2] * y[n/2]; // r(n/2)
for (j = 1; j < n/2; j++)
{
  float a = x[j];
  float b = x[n - j];
  float c = y[j];
  float d = y[n - j];
  out[j] += a*c-b*d; // Re
  out[n - j] += a*d+b*c; // Im
}

The code's problem is that it reads the memory backward and forward and the memory cache cannot follow the code's read timings. To resolve this problem, we should rearrange the arrays using the following code.

out[0] = orig[0];
out[1] = orig[n/2];
for(int t = 1;t < n/2;t ++)
{
  out[2*t+0] = orig[t];
  out[2*t+1] = orig[n-t];
}

Then, we can get the following arrays.

r0, rn/2, r1, i1, r2, i2, ..., r(n+1)/2-1, i(n+1)/2-1

The following is the memory read direction optimized code.

out[0] = x[0] * y[0]; // r0
out[1] = x[1] * y[1]; // r(n/2)
for (j = 1; j < n/2; j++)
{
  float a = x[j*2];
  float b = x[j*2+1];
  float c = y[j*2];
  float d = y[j*2+1];
  out[j*2] += a*c-b*d; // Re
  out[j*2+1] += a*d+b*c; // Im
}

Performance Tools

You can use gprof if you are using GCC. You should search the hot spot before you optimize your code using SIMD and so on. Otherwise, your effort may not work well.

SIMD optimization

Next, we can optimize this code using SIMD. You can learn AoS(Arrays-of-Structure) and SoA(Structure-of-Arrays) from the Intel or AMD's web site, so please read them if you want to know more about them.

struct
{
  float A[1000], B[1000], C[1000];
} SoA_data;

struct
{
  float A, B, C;
} AoS_data[1000];

The following data tructure may be preferred if you are using 3DNow! or SSE.

struct
{
  float Re[2], Im[2]; // 16 byte
} Complex_SoA_3DNOW[N/2];
struct
{
  float Re[4]; // 16byte alignment for movaps
  float Im[4]; // 16byte alignment for movaps
} Complex_SoA_SSE[N/4];

You can use the following code to rearrange the data arrays.

// 3DNow!
// sizeof(out[] = orig[])
out[0] = orig[0]; // r0
out[1] = orig[1]; // r1
out[2] = orig[n/2]; // r(n/2)
out[3] = orig[n-1]; // i1
for(int t = 1;t < n/4;t ++)
{
  out[4*t+0] = orig[2*t];
  out[4*t+1] = orig[2*t+1];
  out[4*t+2] = orig[n-2*t];
  out[4*t+3] = orig[n-2*t-1];
}
// SSE
// sizeof(out[] = orig[])
out[0] = orig[0]; // r0
out[1] = orig[1]; // r1
out[2] = orig[2]; // r2
out[3] = orig[3]; // r3
out[4+0] = orig[n/2]; // r(n/2)
out[4+1] = orig[n-1]; // i1
out[4+2] = orig[n-2]; // i2
out[4+3] = orig[n-3]; // i3
for(int t = 1;t < n/8;t ++)
{
  out[8*t+0] = orig[4*t];
  out[8*t+1] = orig[4*t+1];
  out[8*t+2] = orig[4*t+2];
  out[8*t+3] = orig[4*t+3];
  out[8*t+4+0] = orig[n-4*t];
  out[8*t+4+1] = orig[n-4*t-1];
  out[8*t+4+2] = orig[n-4*t-2];
  out[8*t+4+3] = orig[n-4*t-3];
}

Now we can optimize the code using SIMD. The following pseudo-code is the SIMD calculation of (a+bi)*(c+di)=(ac-bd)+(ad+bc)i.

# source,dest
# 3DNow!
femms
for(...){
movq  x.Re, mm0 # mm0 = a
movq  mm0,  mm2 # mm2 = a
movq  y.Re, mm1 # mm1 = c
pfmul mm1,  mm0 # mm0 = a*c
movq  x.Im, mm4 # mm4 = b
movq  mm4,  mm3 # mm3 = b
pfmul mm1,  mm3 # mm3 = b*c
movq  y.Im, mm5 # mm5 = d
pfmul mm5,  mm2 # mm2 = a*d
pfmul mm5,  mm4 # mm4 = b*d
pfsub mm4,  mm0 # mm0 = a*c - b*d
pfadd out.Re, mm0 # mm0 = Re + a*c - b*d
movq  mm0, out.Re # Re = Re + a*c - b*d
pfadd mm3,  mm2 # mm2 = a*d - b*c
pfadd out.Im, mm2 # mm2 = Im + a*d - b*c
movq  mm2, out.Im # Im = Im + a*d - b*c
}
femms
# SSE
for(...){
movaps x.Re, xmm0 # xmm0 = a
movaps xmm0, xmm2 # xmm2 = a
movaps y.Re, xmm1 # xmm1 = c
mulps  xmm1, xmm0 # xmm0 = a*c
movaps x.Im, xmm4 # xmm4 = b
movaps xmm4, xmm3 # xmm3 = b
mulps  xmm1, xmm3 # xmm3 = b*c
movaps y.Im, xmm5 # xmm5 = d
mulps  xmm5, xmm2 # xmm2 = a*d
mulps  xmm5, xmm4 # xmm4 = b*d
subps  xmm4, xmm0 # xmm0 = a*c - b*d
addps  out.Re, xmm0 # xmm0 = Re + a*c - b*d
movaps xmm0, out.Re # Re = Re + a*c - b*d
addps  xmm3, xmm2 # xmm2 = a*d - b*c
addps  out.Im, xmm2 # xmm2 = Im + a*d - b*c
movaps xmm2, out.Im # Im = Im + a*d - b*c
}

Valid XHTML 1.1 Valid CSS!

[ freeverb3-vst.sourceforge.io | freeverb3@Savannah | sourceforge.net ]