Tuesday, July 22, 2008

Gcc auto-vectorization

Code: vect_auto.c

#include <stdio.h>
#include <string.h>

/*
 * Micro-benchmark for GCC auto-vectorization.
 *
 * Fills two int arrays, repeatedly adds b[] into a[] element-wise (the hot
 * loop whose code generation is being compared), then folds a[] into a
 * checksum that is returned as the exit status so the compiler cannot
 * discard the computation as dead code.
 *
 * Returns the low 8 bits of the checksum of a[].
 */
int main(void)
{
    int i, j, k;
    unsigned int sum;

    int a[4096];
    int b[4096];

    /* Size in bytes, not elements: sizeof covers the whole array. */
    memset(a, 0, sizeof a);
    memset(b, 0, sizeof b);

    for (i = 0; i < 4096; i++) {
        a[i] = i % 67;
        b[i] = i % 87;
    }

    /*
     * Hot loop: the inner element-wise add is the vectorization candidate.
     * Each element ends at most at 66 + 200000 * 86, which still fits in a
     * 32-bit int, so the additions do not overflow.
     */
    for (k = 0; k < 200000; k++) {
        for (i = 0; i < 4096; i++) {
            a[i] = a[i] + b[i];
        }
    }

    /*
     * Index with j, the loop variable: i is 4096 here, so a[i] would read
     * past the end of the array. The total exceeds INT_MAX, so accumulate in
     * an unsigned type, where wrap-around is well defined.
     */
    sum = 0;
    for (j = 0; j < 4096; j++) {
        sum += (unsigned int)a[j];
    }

    /* Only the low 8 bits of an exit status are reported to the shell on POSIX. */
    return (int)(sum & 0xFFu);
}



$ gcc -O3 -ftree-vectorize vect_auto.c -o vect_o

disasm:


0x0000000000400530 : movdqa (%rax,%rbp,1),%xmm0
0x0000000000400535 : paddd (%rax,%rbx,1),%xmm0
0x000000000040053a : movdqa %xmm0,(%rax,%rbx,1)

0x000000000040053f : add $0x10,%rax
0x0000000000400543 : cmp $0x4000,%rax
0x0000000000400549 : jne 0x400530



$ gcc -O3 vect_auto.c -o vect_o_no
disasm:

0x0000000000400528 : xor %edx,%edx
0x000000000040052a : nopw 0x0(%rax,%rax,1)
0x0000000000400530 : mov 0x0(%rbp,%rdx,4),%eax
0x0000000000400534 : add %eax,(%rbx,%rdx,4)
0x0000000000400537 : add $0x1,%rdx
0x000000000040053b : cmp $0x1000,%rdx
0x0000000000400542 : jne 0x400530

0x0000000000400544 : add $0x1,%ecx
0x0000000000400547 : cmp $0x30d40,%ecx
0x000000000040054d : jne 0x400528

[rtg@rtgCent vect]$ time ./vect_o_no

real 0m2.178s
user 0m2.128s
sys 0m0.003s
[rtg@rtgCent vect]$ time ./vect_o

real 0m0.867s
user 0m0.835s
sys 0m0.001s

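Vector operations are used in the second executable (vect_o), but the gain in time is not that big: about 2.5x (0.867 s vs. 2.178 s), even though each paddd instruction adds four ints at once.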

No comments: