/* dct64_sse: MMX/SSE optimized dct64 copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1 see COPYING and AUTHORS files in distribution or http://mpg123.org initially written by Taihei Monma */ #include "mangle.h" #define ARG(n) (8+n*4)(%ebp) #define TEMP(n) (4+n*16)(%esp) #define TEMP_BYTE(n) (4+n)(%esp) /* void dct64_sse(short *out0, short *out1, real *samples); */ #if !defined (__APPLE__) && !defined (__OS2__) .section .rodata #else .data #endif ALIGN16 pnpn: .long 0 .long -2147483648 .long 0 .long -2147483648 ALIGN16 mask: .long -1 .long -1 .long -1 .long 0 .text ALIGN16 .globl ASM_NAME(dct64_sse) ASM_NAME(dct64_sse): pushl %ebp movl %esp, %ebp andl $-16, %esp /* align the stack at 16 bytes */ subl $128, %esp /* reserve space for temporal store */ pushl %ebx GET_GOT movl ARG(2), %eax MOVUAPS (%eax), %xmm7 MOVUAPS 16(%eax), %xmm6 MOVUAPS 112(%eax), %xmm0 MOVUAPS 96(%eax), %xmm1 shufps $0x1b, %xmm0, %xmm0 shufps $0x1b, %xmm1, %xmm1 movaps %xmm7, %xmm4 movaps %xmm6, %xmm5 addps %xmm0, %xmm4 addps %xmm1, %xmm5 subps %xmm0, %xmm7 subps %xmm1, %xmm6 movaps %xmm4, TEMP(0) movaps %xmm5, TEMP(1) MOVUAPS 32(%eax), %xmm2 MOVUAPS 48(%eax), %xmm3 MOVUAPS 80(%eax), %xmm0 MOVUAPS 64(%eax), %xmm1 shufps $0x1b, %xmm0, %xmm0 shufps $0x1b, %xmm1, %xmm1 movaps %xmm2, %xmm5 movaps %xmm3, %xmm4 addps %xmm0, %xmm2 addps %xmm1, %xmm3 subps %xmm0, %xmm5 subps %xmm1, %xmm4 #if defined(PIC) && defined(__APPLE__) mov GLOBAL_VAR_PTR(costab_mmxsse), %ecx #else lea GLOBAL_VAR(costab_mmxsse), %ecx #endif mulps (%ecx), %xmm7 mulps 16(%ecx), %xmm6 mulps 32(%ecx), %xmm5 mulps 48(%ecx), %xmm4 shufps $0x1b, %xmm2, %xmm2 shufps $0x1b, %xmm3, %xmm3 shufps $0x1b, %xmm4, %xmm4 shufps $0x1b, %xmm5, %xmm5 movaps TEMP(0), %xmm0 movaps TEMP(1), %xmm1 subps %xmm3, %xmm0 subps %xmm2, %xmm1 addps TEMP(0), %xmm3 addps TEMP(1), %xmm2 movaps %xmm3, TEMP(0) movaps %xmm2, TEMP(1) movaps %xmm6, %xmm2 movaps %xmm7, %xmm3 subps %xmm5, %xmm6 subps %xmm4, %xmm7 addps %xmm3, %xmm4 addps %xmm2, %xmm5 mulps 64(%ecx), %xmm0 mulps 80(%ecx), %xmm1 mulps 80(%ecx), %xmm6 mulps 64(%ecx), %xmm7 movaps TEMP(0), %xmm2 movaps TEMP(1), %xmm3 shufps $0x1b, %xmm3, %xmm3 shufps $0x1b, %xmm5, %xmm5 shufps $0x1b, %xmm1, %xmm1 shufps $0x1b, %xmm6, %xmm6 movaps %xmm0, TEMP(1) subps %xmm3, %xmm2 subps %xmm1, %xmm0 addps TEMP(0), %xmm3 addps TEMP(1), %xmm1 movaps %xmm3, TEMP(0) movaps %xmm1, TEMP(2) movaps %xmm5, %xmm1 movaps %xmm4, %xmm5 movaps %xmm7, %xmm3 subps %xmm1, %xmm5 subps %xmm6, %xmm7 addps %xmm1, %xmm4 addps %xmm3, %xmm6 mulps 96(%ecx), %xmm2 mulps 96(%ecx), %xmm0 mulps 96(%ecx), %xmm5 mulps 96(%ecx), %xmm7 movaps %xmm2, TEMP(1) movaps %xmm0, TEMP(3) movaps %xmm4, %xmm2 movaps %xmm5, %xmm3 shufps $0x44, %xmm6, %xmm2 shufps $0xbb, %xmm7, %xmm5 shufps $0xbb, %xmm6, %xmm4 shufps $0x44, %xmm7, %xmm3 movaps %xmm2, %xmm6 movaps %xmm3, %xmm7 subps %xmm4, %xmm2 subps %xmm5, %xmm3 addps %xmm6, %xmm4 addps %xmm7, %xmm5 movaps 112(%ecx), %xmm0 movlhps %xmm0, %xmm0 mulps %xmm0, %xmm2 mulps %xmm0, %xmm3 movaps %xmm0, TEMP(4) movaps %xmm4, %xmm6 movaps %xmm5, %xmm7 shufps $0x14, %xmm2, %xmm4 shufps $0xbe, %xmm2, %xmm6 shufps $0x14, %xmm3, %xmm5 shufps $0xbe, %xmm3, %xmm7 movaps %xmm5, TEMP(5) movaps %xmm7, TEMP(7) movaps TEMP(0), %xmm0 movaps TEMP(1), %xmm1 movaps %xmm0, %xmm2 movaps %xmm1, %xmm3 shufps $0x44, TEMP(2), %xmm2 shufps $0xbb, TEMP(3), %xmm1 shufps $0xbb, TEMP(2), %xmm0 shufps $0x44, TEMP(3), %xmm3 movaps %xmm2, %xmm5 movaps %xmm3, %xmm7 subps %xmm0, %xmm2 subps %xmm1, %xmm3 addps %xmm5, %xmm0 addps %xmm7, %xmm1 mulps TEMP(4), %xmm2 mulps TEMP(4), %xmm3 movaps %xmm0, %xmm5 movaps %xmm1, %xmm7 shufps $0x14, %xmm2, %xmm0 shufps $0xbe, %xmm2, %xmm5 shufps $0x14, %xmm3, %xmm1 shufps $0xbe, %xmm3, %xmm7 movaps %xmm0, TEMP(0) movaps %xmm1, TEMP(1) movaps %xmm5, TEMP(2) movaps %xmm7, TEMP(3) movss 120(%ecx), %xmm5 shufps $0x00, %xmm5, %xmm5 xorps LOCAL_VAR(pnpn), %xmm5 movaps %xmm4, %xmm0 movaps %xmm6, %xmm1 unpcklps TEMP(5), %xmm4 unpckhps TEMP(5), %xmm0 unpcklps TEMP(7), %xmm6 unpckhps TEMP(7), %xmm1 movaps %xmm4, %xmm2 movaps %xmm6, %xmm3 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm2 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm3 movaps %xmm4, %xmm0 movaps %xmm6, %xmm1 subps %xmm2, %xmm0 subps %xmm3, %xmm1 addps %xmm2, %xmm4 addps %xmm3, %xmm6 mulps %xmm5, %xmm0 mulps %xmm5, %xmm1 movaps %xmm5, TEMP(5) movaps %xmm4, %xmm5 movaps %xmm6, %xmm7 unpcklps %xmm0, %xmm4 unpckhps %xmm0, %xmm5 unpcklps %xmm1, %xmm6 unpckhps %xmm1, %xmm7 movaps TEMP(0), %xmm0 movaps TEMP(2), %xmm2 movaps %xmm4, TEMP(4) movaps %xmm6, TEMP(6) movaps %xmm0, %xmm4 movaps %xmm2, %xmm6 unpcklps TEMP(1), %xmm0 unpckhps TEMP(1), %xmm4 unpcklps TEMP(3), %xmm2 unpckhps TEMP(3), %xmm6 movaps %xmm0, %xmm1 movaps %xmm2, %xmm3 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm1 unpcklps %xmm6, %xmm2 unpckhps %xmm6, %xmm3 movaps %xmm0, %xmm4 movaps %xmm2, %xmm6 subps %xmm1, %xmm4 subps %xmm3, %xmm6 addps %xmm1, %xmm0 addps %xmm3, %xmm2 mulps TEMP(5), %xmm4 mulps TEMP(5), %xmm6 movaps %xmm0, %xmm1 movaps %xmm2, %xmm3 unpcklps %xmm4, %xmm0 unpckhps %xmm4, %xmm1 unpcklps %xmm6, %xmm2 unpckhps %xmm6, %xmm3 movaps %xmm0, TEMP(0) movaps %xmm1, TEMP(1) movaps %xmm2, TEMP(2) movaps %xmm3, TEMP(3) movaps %xmm5, TEMP(5) movaps %xmm7, TEMP(7) movss TEMP_BYTE(12), %xmm0 movss TEMP_BYTE(28), %xmm1 movss TEMP_BYTE(44), %xmm2 movss TEMP_BYTE(60), %xmm3 addss TEMP_BYTE(8), %xmm0 addss TEMP_BYTE(24), %xmm1 addss TEMP_BYTE(40), %xmm2 addss TEMP_BYTE(56), %xmm3 movss %xmm0, TEMP_BYTE(8) movss %xmm1, TEMP_BYTE(24) movss %xmm2, TEMP_BYTE(40) movss %xmm3, TEMP_BYTE(56) movss TEMP_BYTE(76), %xmm0 movss TEMP_BYTE(92), %xmm1 movss TEMP_BYTE(108), %xmm2 movss TEMP_BYTE(124), %xmm3 addss TEMP_BYTE(72), %xmm0 addss TEMP_BYTE(88), %xmm1 addss TEMP_BYTE(104), %xmm2 addss TEMP_BYTE(120), %xmm3 movss %xmm0, TEMP_BYTE(72) movss %xmm1, TEMP_BYTE(88) movss %xmm2, TEMP_BYTE(104) movss %xmm3, TEMP_BYTE(120) movaps TEMP_BYTE(16), %xmm1 movaps TEMP_BYTE(48), %xmm3 movaps TEMP_BYTE(80), %xmm5 movaps TEMP_BYTE(112), %xmm7 movaps %xmm1, %xmm0 movaps %xmm3, %xmm2 movaps %xmm5, %xmm4 movaps %xmm7, %xmm6 shufps $0x1e, %xmm0, %xmm0 shufps $0x1e, %xmm2, %xmm2 shufps $0x1e, %xmm4, %xmm4 shufps $0x1e, %xmm6, %xmm6 leal LOCAL_VAR(mask), %ecx andps (%ecx), %xmm0 andps (%ecx), %xmm2 andps (%ecx), %xmm4 andps (%ecx), %xmm6 addps %xmm0, %xmm1 addps %xmm2, %xmm3 addps %xmm4, %xmm5 addps %xmm6, %xmm7 movaps TEMP_BYTE(32), %xmm2 movaps TEMP_BYTE(96), %xmm6 movaps %xmm2, %xmm0 movaps %xmm6, %xmm4 shufps $0x1e, %xmm0, %xmm0 shufps $0x1e, %xmm4, %xmm4 andps (%ecx), %xmm0 andps (%ecx), %xmm4 addps %xmm3, %xmm2 addps %xmm0, %xmm3 addps %xmm7, %xmm6 addps %xmm4, %xmm7 movaps TEMP_BYTE(0), %xmm0 movaps TEMP_BYTE(64), %xmm4 cvtps2pi %xmm0, %mm0 cvtps2pi %xmm1, %mm1 movhlps %xmm0, %xmm0 movhlps %xmm1, %xmm1 cvtps2pi %xmm0, %mm2 cvtps2pi %xmm1, %mm3 packssdw %mm2, %mm0 packssdw %mm3, %mm1 cvtps2pi %xmm2, %mm2 cvtps2pi %xmm3, %mm3 movhlps %xmm2, %xmm2 movhlps %xmm3, %xmm3 cvtps2pi %xmm2, %mm4 cvtps2pi %xmm3, %mm5 packssdw %mm4, %mm2 packssdw %mm5, %mm3 mov %ecx, TEMP_BYTE(0) movl ARG(0), %ecx movl ARG(1), %ebx movd %mm0, %eax movd %mm1, %edx movw %ax, 512(%ecx) movw %dx, 384(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, (%ecx) movw %ax, (%ebx) movw %dx, 128(%ebx) movd %mm2, %eax movd %mm3, %edx movw %ax, 448(%ecx) movw %dx, 320(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 64(%ebx) movw %dx, 192(%ebx) psrlq $32, %mm0 psrlq $32, %mm1 movd %mm0, %eax movd %mm1, %edx movw %ax, 256(%ecx) movw %dx, 128(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 256(%ebx) movw %dx, 384(%ebx) psrlq $32, %mm2 psrlq $32, %mm3 movd %mm2, %eax movd %mm3, %edx movw %ax, 192(%ecx) movw %dx, 64(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 320(%ebx) movw %dx, 448(%ebx) mov TEMP_BYTE(0), %eax movaps %xmm4, %xmm0 shufps $0x1e, %xmm0, %xmm0 movaps %xmm5, %xmm1 andps (%eax), %xmm0 addps %xmm6, %xmm4 addps %xmm7, %xmm5 addps %xmm1, %xmm6 addps %xmm0, %xmm7 cvtps2pi %xmm4, %mm0 cvtps2pi %xmm5, %mm1 movhlps %xmm4, %xmm4 movhlps %xmm5, %xmm5 cvtps2pi %xmm4, %mm2 cvtps2pi %xmm5, %mm3 packssdw %mm2, %mm0 packssdw %mm3, %mm1 cvtps2pi %xmm6, %mm2 cvtps2pi %xmm7, %mm3 movhlps %xmm6, %xmm6 movhlps %xmm7, %xmm7 cvtps2pi %xmm6, %mm4 cvtps2pi %xmm7, %mm5 packssdw %mm4, %mm2 packssdw %mm5, %mm3 movd %mm0, %eax movd %mm2, %edx movw %ax, 480(%ecx) movw %dx, 416(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 32(%ebx) movw %dx, 96(%ebx) psrlq $32, %mm0 psrlq $32, %mm2 movd %mm0, %eax movd %mm2, %edx movw %ax, 224(%ecx) movw %dx, 160(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 288(%ebx) movw %dx, 352(%ebx) movd %mm1, %eax movd %mm3, %edx movw %ax, 352(%ecx) movw %dx, 288(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 160(%ebx) movw %dx, 224(%ebx) psrlq $32, %mm1 psrlq $32, %mm3 movd %mm1, %eax movd %mm3, %edx movw %ax, 96(%ecx) movw %dx, 32(%ecx) shrl $16, %eax shrl $16, %edx movw %ax, 416(%ebx) movw %dx, 480(%ebx) popl %ebx movl %ebp, %esp popl %ebp ret #if defined(PIC) && defined(__APPLE__) .section __IMPORT,__pointers,non_lazy_symbol_pointers L_costab_mmxsse: .indirect_symbol ASM_NAME(costab_mmxsse) .long 0 #endif NONEXEC_STACK