TITLE   "FFT, IFFT, etc. for 32-bit Flat model."
;       ffts32.asm --- single precision fft algorithms
;       (C) 1997,1998,2000. C. Bond. All rights reserved.
;       NOTE: for '586 or higher processors only!
;
;
;       public routines:
;
;       void sfft(float a[], int N, int flag);
;               complex fft, 'N' point fft of N complex numbers
;               stored in ((float)real,(float imag)) format.
;               Array a[] has 2*N elements of size 'float'.
;               Set flag = +1 for forward fft, -1 for inverse fft.
;
;       void csfft(complex <float> a[], int N, int flag);
;               complex fft, 'N' point fft of N complex numbers,
;               flag = +1 for forward fft, -1 for inverse fft.
;               This routine is physically the same as 'sfft',
;               but labeled differently to accomodate the
;               different calling conventions.
;
;       void csfft2(complex <float> a[], int N, int flag);
;               same as 'sfft' except that lookup tables
;               farg and fargm are used instead of computing
;               sines within the routine. currently limited
;               to 2^19 point fft.
;
;       void cmulvec(complex <float> a[],complex <float> b[], int N);
;               multiply two complex vectors, element by element,
;               with result going to first vector. (a[j] <- a[j]*b[j]).
;               operates on vectors with 'N' complex elements.
;               uses reduced number of multiplications.
;
;       void cmvec(complex <float> a[], complex <float> b[], int N);
;               same as above but using straightforward computation.
;               actually exhibits superior performance on Pentium.
;
;       void cjmulvec(complex <float> a[], complex <float> b[], int N);
;               multiply complex vector by conjugate of complex vector,
;               with result going to first vector.
;
;       void cjmvec(complex <float> a[], complex <float> b[], int N);
;               same as above, except that straightforward computation
;               is used. performs better than previous routine on Pentium.
;
;       int cdivvec(complex <float> a[], complex <float> b[], int N);
;               element by element division of a[] by b[] with result
;               to a[]. returns number of divide-by-zero events
;               encountered. hence, a zero return value signals successful
;               completion.
;
;       void cmag(complex <float> a[], float b[], int N);
;               returns element by element magnitude of complex
;               vector a[] in real vector b[]. Number of complex
;               elements in a[] and real elements in b[] is 'N'.
;
;       void cphase(complex <float> a[], float b[], int N);
;               counterpart to magnitude spectrum above, this
;               returns the phase of complex vector a[] in
;               real vector b[].
;
;       void fftscale(float a[], int N);
;       void cfftscale(complex <float> a[], int N);
;               divides each element of a[] by length of sequence 'N'.
;               used to correct amplitude of inverse fft.
;
;       void fftflip(float a[],int N);
;       void cfftflip(complex <float> a[],int N);
;               flips a complex vector so that the upper and lower
;               halves are exchanged. used after convolutions to
;               correct for failure to provide impulse function
;               in 'wrap-around' order.
;
        .586
        .model flat, C
        locals
	.data
twopi   dd      6.283185307179586
half    dd      0.500000000000000
mtwo    dd     -2.000000000000000

farg    dd      0.0
        dd      1.0
        dd      7.0710678118654752e-1
        dd      3.8268343236508977e-1
        dd      1.9509032201612827e-1
        dd      9.8017140329560602e-2
        dd      4.9067674327418014e-2
        dd      2.4541228522912288e-2
        dd      1.2271538285719926e-2
        dd      6.1358846491544754e-3
        dd      3.0679567629659763e-3
        dd      1.5339801862847656e-3
        dd      7.6699031874270453e-4
        dd      3.8349518757139559e-4
        dd      1.9174759731070331e-4
        dd      9.5873799095977346e-5
        dd      4.7936899603066885e-5
        dd      2.3968449808418219e-5
        dd      1.1984224905069706e-5
        dd      5.9921124526424278e-6
        dd      2.9960562263346608e-6
fargm   dd      0.0
        dd      -1.0
        dd      -7.0710678118654752e-1
        dd      -3.8268343236508977e-1
        dd      -1.9509032201612827e-1
        dd      -9.8017140329560602e-2
        dd      -4.9067674327418014e-2
        dd      -2.4541228522912288e-2
        dd      -1.2271538285719926e-2
        dd      -6.1358846491544754e-3
        dd      -3.0679567629659763e-3
        dd      -1.5339801862847656e-3
        dd      -7.6699031874270453e-4
        dd      -3.8349518757139559e-4
        dd      -1.9174759731070331e-4
        dd      -9.5873799095977346e-5
        dd      -4.7936899603066885e-5
        dd      -2.3968449808418219e-5
        dd      -1.1984224905069706e-5
        dd      -5.9921124526424278e-6
        dd      -2.9960562263346608e-6

	.code
        public sfft,csfft,cmulvec,cjmulvec,cdivvec,cmag,cphase
        public cfftflip, cmvec,csfft2,cfftscale,cjmvec,fftscale
        public fftflip,cmvec2,cjmvec2,circconv,circcorr
        public rectpol,polrect
;
;	brswap :
;
;	exchanges pairs of array elements which correspond to
;	reversed (flipped) binary indexes.
;
brswap  proc
    arg adata:ptr,log2:dword
    uses ebx,esi,edi
	

    mov ebx,adata    ;initialize array pointer
	mov edx,1
    mov ecx,log2
    shl edx,cl
    sub edx,1        ;decrement to n-1
brsl0:
    mov ecx,log2     ;loop counter
	mov esi,edx
    mov eax,edx
	xor edi,edi
brsl:
    shr eax,1                ;do bit reverse
	rcl edi,1
	loop brsl

	cmp edi,esi		;skip load and store to same or higher entry
	jbe brsl2	
	shl edi,1		;double index to account for complex numbers
    shl esi,1               ; (1 real float and 1 imaginary float)
    push dword ptr ebx[esi*4]
    push dword ptr ebx[edi*4]
    pop dword ptr ebx[esi*4]
    pop dword ptr ebx[edi*4]
    push dword ptr ebx[esi*4+4]
    push dword ptr ebx[edi*4+4]
    pop dword ptr ebx[esi*4+4]
    pop dword ptr ebx[edi*4+4]
;
brsl2:
    dec edx
	jne brsl0	;loop until next index is zero, then exit
	ret
brswap	endp

;
;       csfft :
;
;       performs single precision decimation in time fft
;
;       'C' calling conventions:
;
;               void csfft(complex <float> a[], int n, int isign);
;
;       where 'a[]' is an array of complex single precision values,
;       'n' is the order of the fft, 'isign' is '1' for a forward
;       fft and '-1' for an inverse fft. The type 'int' is 32-bits.
;
;       Since C++ permits overloading functions according to different
;       parameter lists, this can also be declared and called as:
;
;               void sfft(float a[], int n, int isign);
;
;       where the length of 'a[]' is 2*n, and the elements consist
;       of the alternating real and imaginary parts of a complex
;       vector.
;
;       To facilitate the use of the same routine with either calling
;       convention, the code block is nested with one label for
;       the outer block and the other for the inner block.
;
sfft   proc
csfft  proc
    arg adata:ptr,nn:dword,isign:dword
    local log2:dword,istep:dword,mmax:dword
    local wpr:dword,wpi:dword,wtemp:dword
    uses ebx,esi,edi

; Use bitscan to calculate power of 2
    mov eax,nn
    bsf ebx,eax
    mov log2,ebx

    shl nn,1        ;convert number of samples to number of floats
    call brswap,adata,log2
    mov mmax,2
    mov ebx,adata
oloop:
    mov eax,mmax
    cmp eax,nn
    jae fftx
    shl eax,1           ; 0    1    2    3    4    5    6    7
    mov istep,eax       ; =====================================
	fld twopi			; 2pi
	fild mmax			; mm   2pi
	fimul isign			; ..   2pi
	fdiv				; th
	fld st(0)			; th   th
	fmul half			; ..   th
	fsin				; wtm  th
	fst wtemp			; wtm  th
	fmul st(0),st(0)	; wt2  th
	fmul mtwo			; wpr  th
	fstp wpr			; th
	fsin				; wpi
	fstp wpi			;
	fld1				; wr
	fldz				; wi  wr
    mov ecx,0       ;ecx=m
iloop1:
    mov esi,ecx     ;esi=i
iloop2:
    mov edi,esi
    add edi,mmax    ;edi=j
    fld dword ptr ebx[edi*4]        ; d[j] wi   wr
    fmul st(0),st(2)                ; ..   wi   wr
    fld dword ptr ebx[edi*4+4]      ; d[j+]..   wi   wr
    fmul st(0),st(2)                ; ..   ..   wi   wr
    fsub                            ; tpr  wi   wr
    fld dword ptr ebx[edi*4]        ; d[j] tpr  wi   wr
    fmul st(0),st(2)                ; ..   tpr  wi   wr
    fld dword ptr ebx[edi*4+4]      ; d[j+]..   tpr  wi  wr
    fmul st(0),st(4)                ; ..   ..   tpr  wi  wr
    fadd                            ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4]        ; d[i] tpi  tpr  wi  wr
    fsub st(0),st(2)                ; ..   tpi  tpr  wi  wr
    fstp dword ptr ebx[edi*4]       ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4+4]      ; d[i+]tpi  tpr  wi  wr
    fsub st(0),st(1)                ; ..   tpi  tpr  wi  wr
    fstp dword ptr ebx[edi*4+4]     ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4+4]      ; d[i+]tpi  tpr  wi   wr
    fadd                            ; ..   tpr  wi   wr
    fstp dword ptr ebx[esi*4+4]     ; tpr  wi   wr
    fld dword ptr ebx[esi*4]        ; d[i] tpr  wi  wr
	fadd 				; ..   wi   wr
    fstp dword ptr ebx[esi*4]       ; wi   wr

    add esi,istep
    cmp esi,nn
	jb iloop2

	fxch				; wr   wi
    fst dword ptr wtemp ; wr   wi
	fld st(0)			; wr   wr   wi
	fmul wpr			; ..   wr   wi
	fld st(2)			; wi   ..   wr   wi
	fmul wpi			; ..   ..   wr   wi
	fsub				; ..  wr   wi
	fadd				; wr' wi
	fxch				; wi   wr'
	fld st(0)			; wi   wi   wr'
	fmul wpr			; ..   wi   wr'
    fld dword ptr wtemp ; wtp  ..   wi  wr'
	fmul wpi			; ..   ..   wi  wr'
	fadd				; ..   wi   wr'
	fadd				; wi'  wr'

    add ecx,2
    mov edx,mmax
    dec edx
    cmp ecx,edx
	jb iloop1

	fstp st(0)			; wr'
	fstp st(0)			;
    mov eax,istep
    mov mmax,eax
	jmp oloop
fftx:
	ret
csfft   endp
sfft    endp

;
;   csfft1 -- same as csfft, except uses lookup table
;           for sines instead of 'fsin'.

csfft2   proc
    arg adata:ptr,nn:dword,isign:dword
    local log2:dword,istep:dword,mmax:dword
    local wpr:dword,wpi:dword,wtemp:dword,argtbl:dword
    uses ebx,esi,edi

; Use bitscan to calculate power of 2
    mov eax,nn
    bsf ebx,eax
    mov log2,ebx

    shl nn,1        ;convert number of samples to number of floats
    call brswap,adata,log2
	mov mmax,2
    mov ebx,adata
    lea ecx,farg
    mov eax,isign
    cmp eax,1
    je fwd
    lea ecx,fargm
fwd:
    mov argtbl,ecx
oloop2:
    mov eax,mmax
    cmp eax,nn
    jae fftx2
    shl eax,1                        ; 0    1    2    3    4    5    6    7
    mov istep,eax                    ; =====================================
    mov ecx,argtbl
    fld dword ptr [ecx+4]            ;st1
    fst wtemp
    fmul st(0),st(0)
    fmul mtwo
    fstp wpr
    fld dword ptr [ecx]
    fstp wpi
    add argtbl,4
	fld1				; wr
	fldz				; wi  wr
    mov ecx,0        ;cx=m
iloop12:
    mov esi,ecx       ;si=i
iloop22:
    mov edi,esi
    add edi,mmax     ;di=j
    fld dword ptr ebx[edi*4]        ; d[j] wi   wr
    fmul st(0),st(2)                ; ..   wi   wr
    fld dword ptr ebx[edi*4+4]      ; d[j+]..   wi   wr
    fmul st(0),st(2)                ; ..   ..   wi   wr
    fsub                            ; tpr  wi   wr
    fld dword ptr ebx[edi*4]        ; d[j] tpr  wi   wr
    fmul st(0),st(2)                ; ..   tpr  wi   wr
    fld dword ptr ebx[edi*4+4]      ; d[j+]..   tpr  wi  wr
    fmul st(0),st(4)                ; ..   ..   tpr  wi  wr
    fadd                            ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4]        ; d[i] tpi  tpr  wi  wr
    fsub st(0),st(2)                ; ..   tpi  tpr  wi  wr
    fstp dword ptr ebx[edi*4]       ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4+4]      ; d[i+]tpi  tpr  wi  wr
    fsub st(0),st(1)                ; ..   tpi  tpr  wi  wr
    fstp dword ptr ebx[edi*4+4]     ; tpi  tpr  wi   wr
    fld dword ptr ebx[esi*4+4]      ; d[i+]tpi  tpr  wi   wr
    fadd                            ; ..   tpr  wi   wr
    fstp dword ptr ebx[esi*4+4]     ; tpr  wi   wr
    fld dword ptr ebx[esi*4]        ; d[i] tpr  wi  wr
    fadd                            ; ..   wi   wr
    fstp dword ptr ebx[esi*4]       ; wi   wr

    add esi,istep
    cmp esi,nn
    jb iloop22

    fxch                            ; wr   wi
    fst dword ptr wtemp             ; wr   wi
    fld st(0)                       ; wr   wr   wi
    fmul wpr                        ; ..   wr   wi
    fld st(2)                       ; wi   ..   wr   wi
    fmul wpi                        ; ..   ..   wr   wi
    fsub                            ; ..  wr   wi
    fadd                            ; wr' wi
    fxch                            ; wi   wr'
    fld st(0)                       ; wi   wi   wr'
    fmul wpr                        ; ..   wi   wr'
    fld dword ptr wtemp             ; wtp  ..   wi  wr'
    fmul wpi                        ; ..   ..   wi  wr'
    fadd                            ; ..   wi   wr'
    fadd                            ; wi'  wr'

    add ecx,2
    mov edx,mmax
    dec edx
    cmp ecx,edx
    jb iloop12

    fstp st(0)                      ; wr'
    fstp st(0)                      ;
    mov eax,istep
    mov mmax,eax
    jmp oloop2
fftx2:
	ret
csfft2    endp

;
;       cmvec
;
;       Multiply corresponding elements of two equal-length complex
;       vectors. Return result in first vector.
;       (Used for circular convolutions.)
;
;       'C' calling conventions:
;
;       void cmvec(complex <float> a[], complex <float> b[], int n);
;
cmvec   proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,edi,esi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
@@cmlp:
    fld dword ptr edi[ecx*8]          ;a
    fld st                            ;a  a
    fmul dword ptr esi[ecx*8]         ;a*c a
    fld dword ptr edi[ecx*8+4]        ;b a*c a
    fmul dword ptr esi[ecx*8+4]       ;b*d a*c a
    fsub                              ;re  a
    fstp dword ptr edi[ecx*8]         ;a
    fmul dword ptr esi[ecx*8+4]       ;a*d
    fld dword ptr edi[ecx*8+4]        ;b a*d
    fmul dword ptr esi[ecx*8]         ;c*b a*d
    fadd                              ;im
    fstp dword ptr edi[ecx*8+4]       ;
    dec ecx
    jns @@cmlp
    ret
cmvec   endp
;
;       cmvec2
;
;       Multiply corresponding elements of two equal-length complex
;       vectors. Return result in first vector.
;       (Used for circular convolutions.)
;       This version allows same vector in 'a' and 'b'.
;
;       'C' calling conventions:
;
;       void cmvec2(complex <float> a[], complex <float> b[], int n);
;
cmvec2  proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,edi,esi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
@@cmlp:
    fld dword ptr edi[ecx*8]          ;a
    fmul dword ptr esi[ecx*8]         ;a*c
    fld dword ptr edi[ecx*8+4]        ;b a*c
    fmul dword ptr esi[ecx*8+4]       ;b*d a*c
    fsub                              ;re
    fld dword ptr edi[ecx*8]          ;a re
    fmul dword ptr esi[ecx*8+4]       ;a*d re
    fld dword ptr edi[ecx*8+4]        ;b a*d re
    fmul dword ptr esi[ecx*8]         ;c*b a*d re
    fadd                              ;im re
    fstp dword ptr edi[ecx*8+4]       ;re
    fstp dword ptr edi[ecx*8]         ;
    dec ecx
    jns @@cmlp
    ret
cmvec2   endp
;
;       cjmvec
;
;       Multiply corresponding elements of two equal-length complex
;       vectors using conjugate of second vector. Return result in
;       first vector.
;       (Used for circular correlations.)
;
;       'C' calling conventions:
;
;       void cjmvec(complex<float> a[], complex<float> b[], int n);
;
cjmvec   proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,edi,esi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
@@cjmlp:
    fld dword ptr edi[ecx*8]          ;a
    fld st                            ;a  a
    fmul dword ptr esi[ecx*8]         ;a*c a
    fld dword ptr edi[ecx*8+4]        ;b a*c a
    fmul dword ptr esi[ecx*8+4]       ;b*d a*c a
    fadd                              ;re  a
    fstp dword ptr edi[ecx*8]         ;a
    fmul dword ptr esi[ecx*8+4]       ;a*d
    fld dword ptr edi[ecx*8+4]        ;b a*d
    fmul dword ptr esi[ecx*8]         ;c*b a*d
    fsubr                             ;im
    fstp dword ptr edi[ecx*8+4]       ;
    dec ecx
    jns @@cjmlp
    ret
cjmvec   endp
;
;       cjmvec2
;
;       Multiply corresponding elements of two equal-length complex
;       vectors using conjugate of second vector. Return result in
;       first vector.
;       (Used for circular correlations.)
;
;       'C' calling conventions:
;
;       void cjmvec2(complex<float> a[], complex<float> b[], int n);
;
cjmvec2   proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,edi,esi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
@@cjmlp:
    fld dword ptr edi[ecx*8]          ;a
    fmul dword ptr esi[ecx*8]         ;a*c
    fld dword ptr edi[ecx*8+4]        ;b a*c
    fmul dword ptr esi[ecx*8+4]       ;b*d a*c
    fadd                              ;re
    fld dword ptr edi[ecx*8]          ;a re
    fmul dword ptr esi[ecx*8+4]       ;a*d re
    fld dword ptr edi[ecx*8+4]        ;b a*d re
    fmul dword ptr esi[ecx*8]         ;c*b a*d re
    fsubr                             ;im re
    fstp dword ptr edi[ecx*8+4]       ;re
    fstp dword ptr edi[ecx*8]         ;
    dec ecx
    jns @@cjmlp
    ret
cjmvec2   endp

;
;       cmulvec
;
;       Multiply corresponding elements of two equal-length complex
;       vectors. Return result in first vector.
;       (Used for circular convolutions.)
;
;       'C' calling conventions:
;
;       void cmulvec(complex<float> a[], complex<float> b[], int n);
;

cmulvec proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,esi,edi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
cmloop:
    fld dword ptr edi[ecx*8]        ;a
    fld dword ptr edi[ecx*8+4]      ;b    a
    fld dword ptr esi[ecx*8]        ;c    b    a
    fld dword ptr esi[ecx*8+4]      ;d    c    b    a
    fld st(0)                       ;d    d    c    b    a
    fmul st(0),st(3)                ;bd   d    c    b    a
    fld st(2)                       ;c    bd   d    c    b    a
    fmul st(0),st(5)                ;ac   bd   d    c    b    a
    fld st(0)                       ;ac   ac   bd   d    c    b    a
    fsub st(0),st(2)                ;re   ac   bd   d    c    b    a
    fstp dword ptr edi[ecx*8]       ;ac   bd   d    c    b    a
    fadd                            ;..   d    c    b    a
    fxch st(4)                      ;a    d    c    b    ..
    faddp st(3),st(0)               ;d    c    a+b  ..
    fadd                            ;c+d  a+b  ..
    fmul                            ;..   ..
    fsubr                           ;im
    fstp dword ptr edi[ecx*8+4]     ;

    dec ecx
	jns cmloop
	ret
cmulvec endp

;
;       cjmulvec
;
;       Multiply corresponding elements of two equal-length complex
;       vectors using the complex conjugate of the second vector.
;       Return result in first vector.
;       (Used for circular correlations.)
;
;       'C' calling conventions:
;
;       void cjmulvec(complex<float> a[], complex<float> b[], int n);
;
cjmulvec proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,esi,edi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
cjmloop:
    fld dword ptr edi[ecx*8]        ;a
    fld dword ptr edi[ecx*8+4]      ;b    a
    fld dword ptr esi[ecx*8]        ;c    b    a
    fld dword ptr esi[ecx*8+4]      ;d    c    b    a
    fchs                            ;d    c    b    a
    fld st(0)                       ;d    d    c    b    a
    fmul st(0),st(3)                ;bd   d    c    b    a
    fld st(2)                       ;c    bd   d    c    b    a
    fmul st(0),st(5)                ;ac   bd   d    c    b    a
    fld st(0)                       ;ac   ac   bd   d    c    b    a
    fsub st(0),st(2)                ;re   ac   bd   d    c    b    a
    fstp dword ptr edi[ecx*8]       ;ac   bd   d    c    b    a
    fadd                            ;..   d    c    b    a
    fxch st(4)                      ;a    d    c    b    ..
    faddp st(3),st(0)               ;d    c    a+b  ..
    fadd                            ;c+d  a+b  ..
    fmul                            ;..   ..
    fsubr                           ;im
    fstp dword ptr edi[ecx*8+4]     ;

    dec ecx
    jns cjmloop
	ret
cjmulvec endp

;
;       cdivvec
;
;       Divide the elements of one complex vector by the corresponding
;       elements of another. Returns zero of there are no
;       divides-by-zero and a positive number (count) otherwise.
;
;       'C' calling conventions:
;
;               int cdivvec(float *a, float *b, int n);
;       or
;
;               int cdivvec(complex<float> *a,complex<float> *b, int n);


cdivvec proc
    arg a:ptr, b:ptr, n:dword
    uses ebx,esi,edi

    mov ecx,n
    dec ecx
    mov edi,a
    mov esi,b
    mov edx,0        ; edx will be non-zero if any divide-by-zero occurs
cdloop:
    fld dword ptr edi[ecx*8]        ; a
    fld dword ptr edi[ecx*8+4]      ; b    a
    fld dword ptr esi[ecx*8]        ; c    b    a
    fld dword ptr esi[ecx*8+4]      ; d    c    b    a
    fld st(0)                       ; d    d    c    b    a
    fmul st(0),st(0)                ; d2   d    c    b    a
    fld st(2)                       ; c    d2   d    c    b    a
    fmul st(0),st(0)                ; c2   d2   d    c    b    a
    fadd                            ; den  d    c    b    a
    fldz                            ; 0    den  d    c    b    a
    fcomp                           ; den  d    c    b    a
	fstsw ax
	sahf
	jnz cd1		;check for zero divisor
	fstp st(0)
	fld1
    inc edx
cd1:
    fld st(4)                       ; a    den  d    c    b    a
    fmul st(0),st(3)                ; ac   den  d    c    b    a
    fld st(2)                       ; d    ac   den  d    c    b    a
    fmul st(0),st(5)                ; bd   ac   den  d    c    b    a
    fadd                            ; ..   den  d    c    b    a
    fdiv st(0),st(1)                ; re   den  d    c    b    a
    fstp dword ptr edi[ecx*8]       ; den  d    c    b    a
    fxch st(4)                      ; a    d    c    b    den
    fmul                            ; ..   c    b    den
    fxch st(2)                      ; b    c    ..   den
    fmul                            ; ..   ..   den
    fsubr                           ; ..   den
    fdivr                           ; im
    fstp dword ptr edi[ecx*8+4]     ;
    dec ecx
	jns cdloop
    mov eax,edx       ; return eax=0 if OK, eax=number of divide-by-0s
	ret
cdivvec endp

;
;       cmag
;
;       'C' calling conventions:
;
;               void cmag(complex<float> a[], float b[], int n);
;
;       where 'a[]' is the complex signal vector of length 'n', and
;       'b[]' is the real magnitude spectrum, also of length 'n'.
;       The type 'int' is 32-bits.
;
cmag    proc
    arg a:ptr,b:ptr,n:dword
    uses ebx,esi, edi

    mov ecx,n
    mov esi,a
    mov edi,b
    dec ecx
@@mloop:
    fld dword ptr esi[ecx*8]
    fmul st(0),st(0)
    fld dword ptr esi[ecx*8+4]
    fmul st(0),st(0)
    fadd
    fsqrt
    fstp dword ptr edi[ecx*4]
    dec ecx
    jns @@mloop
    ret
cmag    endp
;
;       cphase
;
;       'C' calling conventions:
;
;               void cphase(complex<float> a[], float b[], int n);
;
cphase proc
    arg a:ptr,b:ptr,n:dword
    uses ebx,esi, edi

    mov ecx,n
    mov esi,a
    mov edi,b
    dec ecx
@@mloop:
    fld dword ptr esi[ecx*8+4]      ;imaginary ('y') part
    fld dword ptr esi[ecx*8]        ;real ('x') part
    fpatan
    fstp dword ptr edi[ecx*4]
    dec ecx
    jns @@mloop
    ret
cphase endp

;
;       cfftscale -- divides elements of vector a[], by number of
;                       elements in a[]. used to correct amplitude
;                       of inverse fft.
fftscale proc

cfftscale proc
    arg a:ptr,n:dword

    fld1
    fild n
    fdiv
    mov eax,a
    mov ecx,n
    shl ecx,1
    dec ecx
@@vloop:
    fld dword ptr [eax+4*ecx]
    fmul st,st(1)
    fstp dword ptr [eax+4*ecx]
    dec ecx
    jns @@vloop
    fstp st
    ret
cfftscale endp
fftscale endp

;
;       cfftflip
;
;       Exchange lower half and upper half of signal vector to
;       correct for positive/negative frequency layout in
;       convolution.
;
;       'C' calling conventions:
;
;               void cfftflip(complex <float> a[],int n);
;
;       where 'n' must be even -- (assured, if it's a power of 2!)
fftflip proc
cfftflip proc
    arg a:ptr,n:dword
    uses ebx,edi

    mov edi,n
    mov ecx,edi
    shr ecx,1
    shl edi,2
    mov ebx,a
fliploop:
    mov eax,[ebx]
    mov edx,[ebx+edi]
    mov [ebx+edi],eax
    mov [ebx],edx
    mov eax,[ebx+4]
    mov edx,[ebx+edi+4]
    mov [ebx+edi+4],eax
    mov [ebx+4],edx
    add ebx,8
    dec ecx
    jne fliploop
    ret
cfftflip endp
fftflip endp

circconv proc
    arg a:ptr,b:ptr,n:dword
    uses esi,edi,ebx

    call csfft2, a,n,1
    mov eax,a
    cmp eax,b
    je @@ccc1
    call csfft2, b,n,1
@@ccc1:
    call cmvec2, a,b,n
    call csfft2, a,n,-1
    call fftscale, a,n
    ret
circconv  endp
;
;   circcorr -- circular correlation
;
circcorr proc
    arg a:ptr,b:ptr,n:dword
    uses esi,edi,ebx

    call csfft2, a,n,1
    mov eax,a
    cmp eax,b
    je @@ccc1
    call csfft2, b,n,1
@@ccc1:
    call cjmvec2, a,b,n
    call csfft2, a,n,-1
    call fftscale, a,n
    ret
circcorr  endp
;
;   rectpol -- rectangular to polar conversion
;
;   Converts rectangular format vector (re,im) in 'src'
;   to polar form (mag,arg) in 'dest.
;
;   NOTE: 'src' and 'dest' may be the same vector.
;
rectpol proc
    arg src:ptr,dest:ptr,n:dword
    uses esi,edi,ebx

    mov esi,src
    mov edi,dest
    mov ecx,n
    dec ecx

rplp:
    fld dword ptr [esi+8*ecx+4]
    fld dword ptr [esi+8*ecx]
    fld st(0)
    fmul st,st(0)
    fld st(2)
    fmul st,st(0)
    fadd
    fsqrt
    fstp dword ptr [edi+8*ecx]
    fpatan
    fstp dword ptr [edi+8*ecx+4]
    dec ecx
    jns rplp
    ret
rectpol endp
;
;   polrect -- polar to rectangular conversion
;
;   Converts polar format vector (mag,arg) in 'src'
;   to rectangular form (re,im) in 'dest'.
;
;   NOTE: 'src' and 'dest' may be the same vector.
;
polrect proc
    arg src:ptr,dest:ptr,n:dword
    uses esi,edi,ebx

    mov esi,src
    mov edi,dest
    mov ecx,n
    dec ecx

prlp:
    fld dword ptr [esi+8*ecx+4]
    fsincos
    fxch
    fmul dword ptr [esi+8*ecx]
    fstp dword ptr [edi+8*ecx+4]
    fmul dword ptr [esi+8*ecx]
    fstp dword ptr [edi+8*ecx]
    dec ecx
    jns prlp
    ret
polrect endp
        end

