Fast Fourier Transform with x86-64 assembly language
This is an old application I did a while ago. I did this in 2005 when I got my first 64bit CPU (AMD).
The first I did after installing my new CPU was to open VI and start coding an FFT using 64 bit registers. This is old news, but 64 bit at that time was awesome. Not only can you store 64 bits in a register, but you get 32 general purpose registers!
The only really annoying thing with this architecture is that they don't provide a bit reveral instruction. I don't understand why a simple RISC processor like the AVR32 (lookup "brev") has one but not a high end CISC like Intel or AMD. I don't actually show the bit reveral part of the FFT in here though.
By the way, I remember doing some tests with this algorithm and, although I don't remember the results exactly (7 years ago), I remember that it was running at least 5 times faster than most other FFTs in other libraries.
//; x8664realfft(float* source,float** spectrum,long size)
x8664realifft:
mov $1,%eax
cvtsi2ss %eax,%xmm10
pshufd $0b00000000,%xmm10,%xmm10
mov $-1,%eax
cvtsi2ss %eax,%xmm10
pshufd $0b11000100,%xmm10,%xmm10
jmp fftentry
x8664realfft:
mov $1,%eax
cvtsi2ss %eax,%xmm10
pshufd $0b00000000,%xmm10,%xmm10
fftentry:
pushq %rbp
movq %rsp,%rbp
pushq %rbp
subq $0xFF,%rsp
movq %rsp,%rbp
//; make a 16bytes aligned buffer
addq $16,%rbp
andq $0xFFFFFFFFFFFFFFF0,%rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %r11
pushq %r10
pushq %r9
pushq %r8
//; rcx = size
movq %rdx,%rcx
pushq %rcx
//; rdx = source
mov %rdi,%rdx
pushq %rdx
//; rdi = spectrum[0]
movq (%rsi), %rdi
addq $8, %rsi
//; rsi = spectrum[1]
movq (%rsi), %rsi
//; r8 = log2(N), r14= N
pushq %rcx
fld1
fild (%rsp)
xorq %r8,%r8
pushq %r8
fyl2x
fistp (%rsp)
popq %r8
popq %r14
//; bit reversal has already been done prior to calling this function
//; r9 = nLargeSpectrum
//; r10 = nPointsLargeSpectrum
movq %r14,%r9
movq $1,%r10
movq $1,%r11
mov %rdi,%r14
mov %rsi,%r15
//;load 2PI in st(0)
fldpi
fldpi
faddp %st(0),%st(1)
movq %r8,%rcx
l1: pushq %rcx
shrq $1,%r9
shlq $1,%r10
//;st(0) = theta, st(1) = 2pi
fld %st(0)
pushq %r10
fidiv (%rsp)
popq %r10
//;xmm0 = 2*costheta[0],2*costheta[0],2*costheta[0],2*costheta[0]
//; st(0) = theta, st(1) = 2pi
pushq %rax
fld %st(0)
fcos
fstp (%rsp)
movss (%rsp),%xmm0
pshufd $0b00000000,%xmm0,%xmm0
popq %rax
addps %xmm0,%xmm0
movq %r9,%rcx
l2: pushq %rcx
//; r12 = point1 (index *4bytes) r13 = point2 (index *4bytes)
movq %r10,%r12
movq %r9,%rax
subq %rcx,%rax
pushq %rdx
mulq %r12
popq %rdx
movq %rax,%r12
movq %r11,%r13
addq %r12,%r13
shlq $2,%r13
shlq $2,%r12
//; xmm2 = costheta[2],sintheta[2],costheta[1],sintheta[1]
movq %r12,16(%rbp)
decq 16(%rbp)
fld %st(0)
fimul 16(%rbp)
fsincos
fstp (%rbp)
fstp 4(%rbp)
decq 16(%rbp)
fld %st(0)
fimul 16(%rbp)
fsincos
fstp 8(%rbp)
fstp 12(%rbp)
movaps (%rbp),%xmm2
pshufd $0b10110001 ,%xmm2,%xmm2
//;xmm1 = costheta[1],sintheta[1],0,0
movhlps %xmm2,%xmm1
movq %r11,%rcx
l3:
//; recurrence formula
//; xmm3 = w.re,w.im,w.re,w.im
movaps %xmm2,%xmm3
mulps %xmm0,%xmm3
subps %xmm1,%xmm3
movlhps %xmm3,%xmm3
movaps %xmm2,%xmm1
movaps %xmm3,%xmm2
mulps %xmm10,%xmm3
//; xmm5 := c.im,c.re,c.re,c.im
movq %r14,%rdi
movq %r15,%rsi
addq %r13,%rdi
addq %r13,%rsi
movss (%rdi),%xmm5
pshufd $0b00000011,%xmm5,%xmm5
addss (%rsi),%xmm5
pshufd $0b00101000,%xmm5,%xmm5
//; xmm3 := inner product: re,re,im,im
mulps %xmm3,%xmm5
pshufd $0b11011101 ,%xmm5,%xmm3
pshufd $0b10001000 ,%xmm5,%xmm5
addsubps %xmm5,%xmm3
pshufd $0b10101111,%xmm3,%xmm3
//;xmm6 := sortedArray[point1].re,sortedArray[point1].re,sortedArray[point1].im,sortedArray[point1].im
movq %r14,%rdi
movq %r15,%rsi
addq %r12,%rdi
addq %r12,%rsi
movss (%rdi),%xmm6
pshufd $0b00001111,%xmm6,%xmm6
addss (%rsi),%xmm6
pshufd $0b11100000,%xmm6,%xmm6
addsubps %xmm3,%xmm6
pshufd $0b00100111,%xmm6,%xmm6
movss %xmm6,(%rdi)
pshufd $0b11100001,%xmm6,%xmm6
movss %xmm6,(%rsi)
movq %r14,%rdi
movq %r15,%rsi
addq %r13,%rdi
addq %r13,%rsi
pshufd $0b01001110,%xmm6,%xmm6
movss %xmm6,(%rdi)
pshufd $0b11100001,%xmm6,%xmm6
movss %xmm6,(%rsi)
//; increase point1 and point2 by 4 bytes (each index represent a float)
addq $4,%r12
addq $4,%r13
decq %rcx
jnz l3
popq %rcx
decq %rcx
jnz l2
//; remove theta from fpu stack
fstp %st(0)
shlq $1,%r11
popq %rcx
decq %rcx
jnz l1
popq %rdx
//; rcx is already pushed in stack
cvtsi2ss (%rsp),%xmm1
pshufd $0b00000000,%xmm1,%xmm1
popq %rcx
shrq $2,%rcx
movq %r14,%rdi
movq %r15,%rsi
//; is this a ifft or a fft?
cvtss2si %xmm10,%eax
cmp $-1,%eax
jne nrm
cp: movaps (%rdi),%xmm2
movntdq %xmm2,(%rdx)
addq $16,%rdi
addq $16,%rdx
loop cp
jmp cleanexit
nrm:
movaps (%rdi),%xmm2
movaps (%rsi),%xmm3
divps %xmm1,%xmm2
divps %xmm1,%xmm3
movntdq %xmm2,(%rdi)
movntdq %xmm3,(%rsi)
addq $16,%rdi
addq $16,%rsi
loop nrm
cleanexit:
fstp %st(0)
popq %r8
popq %r9
popq %r10
popq %r11
popq %r12
popq %r13
popq %r14
popq %r15
addq $0xFF,%rsp
popq %rbp
leave
ret