Freeverb3 - Optimization Tips

The convolution algorithm of Freeverb3 uses some techniques to achieve fast processing. This article introduces some of them.

Array rearrangement

You should calculate the (a+bi)*(c+di)=(ac-bd)+(ad+bc)i loop in the frequency domain to do the convolution.

We can use Halfcomplex-format DFT of the FFTW3 library since we do not need to calculate imaginary numbers.

float *in, *out;
in = fftwf_malloc(sizeof(float)*size);
out = fftwf_malloc(sizeof(float)*size);
fftwf_plan p;
p = fftwf_plan_r2r_1d(size, in, out, FFTW_R2HC, FFTW_ESTIMATE);
fftwf_execute(p);
fftwf_destroy_plan(p);
fftwf_free(in);
fftwf_free(out);

fftwf_plan p;
p = fftwf_plan_r2r_1d(size, in, out, FFTW_HC2R, FFTW_ESTIMATE);
fftwf_execute(p);
fftwf_destroy_plan(p);

out[0] = x[0] * y[0]; // r0
out[n/2] = x[n/2] * y[n/2]; // r(n/2)
for (j = 1; j < n/2; j++)
{
  float a = x[j];
  float b = x[n - j];
  float c = y[j];
  float d = y[n - j];
  out[j] += a*c-b*d; // Re
  out[n - j] += a*d+b*c; // Im
}

The code's problem is that it reads the memory backward and forward and the memory cache cannot follow the code's read timings. To resolve this problem, we should rearrange the arrays using the following code.

out[0] = orig[0];
out[1] = orig[n/2];
for(int t = 1;t < n/2;t ++)
{
  out[2*t+0] = orig[t];
  out[2*t+1] = orig[n-t];
}

out[0] = x[0] * y[0]; // r0
out[1] = x[1] * y[1]; // r(n/2)
for (j = 1; j < n/2; j++)
{
  float a = x[j*2];
  float b = x[j*2+1];
  float c = y[j*2];
  float d = y[j*2+1];
  out[j*2] += a*c-b*d; // Re
  out[j*2+1] += a*d+b*c; // Im
}

Performance Tools

You can use gprof if you are using GCC. You should search the hot spot before you optimize your code using SIMD and so on. Otherwise, your effort may not work well.

SIMD optimization

Next, we can optimize this code using SIMD. You can learn AoS(Arrays-of-Structure) and SoA(Structure-of-Arrays) from the Intel or AMD's web site, so please read them if you want to know more about them.

struct
{
  float A[1000], B[1000], C[1000];
} SoA_data;

struct
{
  float A, B, C;
} AoS_data[1000];

struct
{
  float Re[2], Im[2]; // 16 byte
} Complex_SoA_3DNOW[N/2];
struct
{
  float Re[4]; // 16byte alignment for movaps
  float Im[4]; // 16byte alignment for movaps
} Complex_SoA_SSE[N/4];

// 3DNow!
// sizeof(out[] = orig[])
out[0] = orig[0]; // r0
out[1] = orig[1]; // r1
out[2] = orig[n/2]; // r(n/2)
out[3] = orig[n-1]; // i1
for(int t = 1;t < n/4;t ++)
{
  out[4*t+0] = orig[2*t];
  out[4*t+1] = orig[2*t+1];
  out[4*t+2] = orig[n-2*t];
  out[4*t+3] = orig[n-2*t-1];
}
// SSE
// sizeof(out[] = orig[])
out[0] = orig[0]; // r0
out[1] = orig[1]; // r1
out[2] = orig[2]; // r2
out[3] = orig[3]; // r3
out[4+0] = orig[n/2]; // r(n/2)
out[4+1] = orig[n-1]; // i1
out[4+2] = orig[n-2]; // i2
out[4+3] = orig[n-3]; // i3
for(int t = 1;t < n/8;t ++)
{
  out[8*t+0] = orig[4*t];
  out[8*t+1] = orig[4*t+1];
  out[8*t+2] = orig[4*t+2];
  out[8*t+3] = orig[4*t+3];
  out[8*t+4+0] = orig[n-4*t];
  out[8*t+4+1] = orig[n-4*t-1];
  out[8*t+4+2] = orig[n-4*t-2];
  out[8*t+4+3] = orig[n-4*t-3];
}

Now we can optimize the code using SIMD. The following pseudo-code is the SIMD calculation of (a+bi)*(c+di)=(ac-bd)+(ad+bc)i.

# source,dest
# 3DNow!
femms
for(...){
movq  x.Re, mm0 # mm0 = a
movq  mm0,  mm2 # mm2 = a
movq  y.Re, mm1 # mm1 = c
pfmul mm1,  mm0 # mm0 = a*c
movq  x.Im, mm4 # mm4 = b
movq  mm4,  mm3 # mm3 = b
pfmul mm1,  mm3 # mm3 = b*c
movq  y.Im, mm5 # mm5 = d
pfmul mm5,  mm2 # mm2 = a*d
pfmul mm5,  mm4 # mm4 = b*d
pfsub mm4,  mm0 # mm0 = a*c - b*d
pfadd out.Re, mm0 # mm0 = Re + a*c - b*d
movq  mm0, out.Re # Re = Re + a*c - b*d
pfadd mm3,  mm2 # mm2 = a*d - b*c
pfadd out.Im, mm2 # mm2 = Im + a*d - b*c
movq  mm2, out.Im # Im = Im + a*d - b*c
}
femms
# SSE
for(...){
movaps x.Re, xmm0 # xmm0 = a
movaps xmm0, xmm2 # xmm2 = a
movaps y.Re, xmm1 # xmm1 = c
mulps  xmm1, xmm0 # xmm0 = a*c
movaps x.Im, xmm4 # xmm4 = b
movaps xmm4, xmm3 # xmm3 = b
mulps  xmm1, xmm3 # xmm3 = b*c
movaps y.Im, xmm5 # xmm5 = d
mulps  xmm5, xmm2 # xmm2 = a*d
mulps  xmm5, xmm4 # xmm4 = b*d
subps  xmm4, xmm0 # xmm0 = a*c - b*d
addps  out.Re, xmm0 # xmm0 = Re + a*c - b*d
movaps xmm0, out.Re # Re = Re + a*c - b*d
addps  xmm3, xmm2 # xmm2 = a*d - b*c
addps  out.Im, xmm2 # xmm2 = Im + a*d - b*c
movaps xmm2, out.Im # Im = Im + a*d - b*c
}

Optimization Tips

Convolution

Array rearrangement

Performance Tools

SIMD optimization