Code: vect_auto.c
#include <stdio.h>
#include <string.h>
/*
 * Auto-vectorization micro-benchmark.
 *
 * Fills two 4096-element int arrays, performs the element-wise update
 * a[i] += b[i] 200000 times (the loop the compiler is expected to
 * vectorize), then reduces a[] to a single value.
 *
 * Returns the low 8 bits of the sum of a[], so that the result of the
 * loops stays observable and the work is not trivially dead code.
 */
int main(void)
{
    enum { N = 4096, REPEAT = 200000 };
    int a[N];
    int b[N];
    int i, j, k;
    /* Unsigned accumulator: the full sum (~3.5e10) does not fit in an int,
     * and signed overflow would be undefined behaviour; unsigned wraps. */
    unsigned int total = 0;

    /* Clear the whole arrays: sizeof gives the size in bytes, not elements. */
    memset(a, 0, sizeof a);
    memset(b, 0, sizeof b);

    for (i = 0; i < N; i++) {
        a[i] = i % 67;
        b[i] = i % 87;
    }

    /* Hot loop under test. The largest element ends at
     * 66 + 200000 * 86 = 17200066, well inside the range of int. */
    for (k = 0; k < REPEAT; k++) {
        for (i = 0; i < N; i++) {
            a[i] = a[i] + b[i];
        }
    }

    /* Reduce over j; indexing with i here would read a[N], past the end. */
    for (j = 0; j < N; j++) {
        total += (unsigned int)a[j];
    }

    /* Mask so the conversion back to int is always well defined. */
    return (int)(total & 0xFFu);
}
$ gcc -O3 -ftree-vectorize vect_auto.c -o vect_o
disasm:
0x0000000000400530: movdqa (%rax,%rbp,1),%xmm0
0x0000000000400535: paddd (%rax,%rbx,1),%xmm0
0x000000000040053a: movdqa %xmm0,(%rax,%rbx,1)
0x000000000040053f: add $0x10,%rax
0x0000000000400543: cmp $0x4000,%rax
0x0000000000400549: jne 0x400530
$ gcc -O3 -fno-tree-vectorize vect_auto.c -o vect_o_no
disasm:
0x0000000000400528: xor %edx,%edx
0x000000000040052a: nopw 0x0(%rax,%rax,1)
0x0000000000400530: mov 0x0(%rbp,%rdx,4),%eax
0x0000000000400534: add %eax,(%rbx,%rdx,4)
0x0000000000400537: add $0x1,%rdx
0x000000000040053b: cmp $0x1000,%rdx
0x0000000000400542: jne 0x400530
0x0000000000400544: add $0x1,%ecx
0x0000000000400547: cmp $0x30d40,%ecx
0x000000000040054d: jne 0x400528
[rtg@rtgCent vect]$ time ./vect_o_no
real 0m2.178s
user 0m2.128s
sys 0m0.003s
[rtg@rtgCent vect]$ time ./vect_o
real 0m0.867s
user 0m0.835s
sys 0m0.001s
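Vector operations (movdqa/paddd on %xmm0) are used in the vectorized executable, vect_o, but the gain in time is not that big: about 2.5x (2.178s vs 0.867s), even though each SSE instruction processes four ints at once.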
No comments:
Post a Comment