Visual Studio 自动向量化
阅读原文时间:2023年07月08日阅读:3

/////////////////////////////////////////////////
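/*
SSE and AVX each expose 16 vector registers in 64-bit mode
(32-bit code, like the listings in this file, can only use the first 8).
SSE registers XMM0 ~ XMM15 are 128 bits wide.
AVX registers YMM0 ~ YMM15 are 256 bits wide.
*/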
// VS2015 can auto-vectorize loops; add /Qvec-report:2 under C/C++ -> Command Line -> Additional Options to print the vectorization report
// Auto-vectorizable.
// Element-wise sum c[i] = a[i] + b[i] over 1024 ints, using a unit-stride index.
void loop1()
{
// NOTE(review): a and b are read without being initialized and c is never
// used afterwards; the function appears to exist only as vectorizer input.
int a[1024];
int b[1024];
int c[1024];
for (int i = 0; i < 1024; i++)
{
c[i] = a[i] + b[i];
}
}

/*
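Vectorized output for loop1 (32-bit code using 256-bit ymm registers;
each iteration of the generated loop handles 64 bytes, i.e. 16 ints)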
for (int i = 0; i < 1024; i++)
{
c[i] = a[i] + b[i];
}

vmovdqu ymm0, YMMWORD PTR _b$[ebp+eax]
vpaddd ymm0, ymm0, YMMWORD PTR _a$[ebp+eax]
vmovdqu YMMWORD PTR _c$[ebp+eax], ymm0
vmovdqu ymm0, YMMWORD PTR _a$[ebp+eax+32]
vpaddd ymm0, ymm0, YMMWORD PTR _b$[ebp+eax+32]
vmovdqu YMMWORD PTR _c$[ebp+eax+32], ymm0
add eax, 64 ; 00000040H
cmp eax, 4096 ; 00001000H
jl SHORT $LL4@main
*/

// Not auto-vectorized (reason code 1301).
// The loop stride must be exactly 1.
// Same element-wise sum as c[i] = a[i] + b[i], but the loop advances by 4 and
// the body is unrolled by hand to cover i .. i + 3.
void loop2()
{
// NOTE(review): a and b are read without being initialized and c is never
// used afterwards; the function appears to exist only as vectorizer input.
int a[1024];
int b[1024];
int c[1024];
// Stride of 4 (i += 4) is what the comment above blames for reason 1301.
for (int i = 0; i < 1024; i += 4)
{
c[i] = a[i] + b[i];
c[i + 1] = a[i + 1] + b[i + 1];
c[i + 2] = a[i + 2] + b[i + 2];
c[i + 3] = a[i + 3] + b[i + 3];
}
}

/*
Assembly output for loop2 (scalar code, not vectorized)
c[i] = a[i] + b[i];
mov ecx, DWORD PTR _b$[ebp+eax]
add DWORD PTR _a$[ebp+eax], ecx
c[i + 1] = a[i + 1] + b[i + 1];
mov ecx, DWORD PTR _a$[ebp+eax+4]
add DWORD PTR _b$[ebp+eax+4], ecx
c[i + 2] = a[i + 2] + b[i + 2];
mov ecx, DWORD PTR _a$[ebp+eax+8]
add DWORD PTR _b$[ebp+eax+8], ecx
c[i + 3] = a[i + 3] + b[i + 3];
mov ecx, DWORD PTR _a$[ebp+eax+12]
add DWORD PTR _b$[ebp+eax+12], ecx
*/

// Not auto-vectorized (reason code 1203): the loop body contains non-contiguous memory accesses.
// Splitting x[128 * 3] into b[128], g[128], r[128] makes auto-vectorization possible; see loop4.
void loop3()
{
// Computes y = A * x for 128 consecutive 3-element vectors stored back to back
// in x and y; A is a 3x3 matrix stored row by row in A[0..8].
// This is a basic image-processing operation (image transforms, RGB adjustment, ...).
// NOTE(review): x and A are read without being initialized and y is never
// used afterwards; the function appears to exist only as vectorizer input.
int x[128 * 3];
int y[128 * 3];
int A[9];
for (int i = 0; i < 128; ++i)
{
y[i * 3 + 0] = x[i * 3 + 0] * A[0] + x[i * 3 + 1] * A[1] + x[i * 3 + 2] * A[2];
y[i * 3 + 1] = x[i * 3 + 0] * A[3] + x[i * 3 + 1] * A[4] + x[i * 3 + 2] * A[5];
// Third matrix row is A[6], A[7], A[8]; using A[6] for every term would
// not be a 3x3 matrix product.
y[i * 3 + 2] = x[i * 3 + 0] * A[6] + x[i * 3 + 1] * A[7] + x[i * 3 + 2] * A[8];
}

}

/*
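Assembly output for loop3 (scalar code, not vectorized).
Note: this listing was generated from a third-row statement that multiplies all
three inputs by A[6] (see the quoted source line below), which is why the
compiler sums the inputs first and multiplies once; a full 3x3 product would
use A[6], A[7] and A[8].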
y[i * 3 + 0] = x[i * 3 + 0] * A[0] + x[i * 3 + 1] * A[1] + x[i * 3 + 2] * A[2];
mov edi, DWORD PTR _x$[ebp+eax+4]
mov ecx, edi
imul ecx, DWORD PTR _A$[ebp+4]
mov esi, DWORD PTR _x$[ebp+eax]
mov edx, esi
imul edx, DWORD PTR _A$[ebp]
mov ebx, DWORD PTR _x$[ebp+eax+8]
add edx, ecx
mov ecx, ebx
imul ecx, DWORD PTR _A$[ebp+8]
add edx, ecx
y[i * 3 + 1] = x[i * 3 + 0] * A[3] + x[i * 3 + 1] * A[4] + x[i * 3 + 2] * A[5];
mov ecx, edi
imul ecx, DWORD PTR _A$[ebp+16]
mov DWORD PTR _y$[ebp+eax], edx
mov edx, esi
imul edx, DWORD PTR _A$[ebp+12]
add edx, ecx
mov ecx, ebx
imul ecx, DWORD PTR _A$[ebp+20]
add edx, ecx
y[i * 3 + 2] = x[i * 3 + 0] * A[6] + x[i * 3 + 1] * A[6] + x[i * 3 + 2] * A[6];
lea ecx, DWORD PTR [esi+edi]
add ecx, ebx
mov DWORD PTR _y$[ebp+eax+4], edx
imul ecx, DWORD PTR _A$[ebp+24]
mov DWORD PTR _y$[ebp+eax+8], ecx
add eax, 12 ; 0000000cH
cmp eax, 1536 ; 00000600H
jl SHORT $LL4@loop3
*/

// Auto-vectorizable.
// Author's note: loop4_ does not need extra full-size buffers to split the
// image data, and its memory locality is better.
// Applies the 3x3 matrix A (stored row by row in A[0..8]) to 128 (b, g, r)
// triples held in three separate arrays, overwriting the inputs in place.
void loop4()
{
// NOTE(review): b, g, r and A are read without being initialized; the
// function appears to exist only as vectorizer input.
int b[128], g[128], r[128];
int A[9];
for (int i = 0; i < 128; ++i)
{
// Compute all three outputs from the old values before storing any of them.
int _b = b[i] * A[0] + g[i] * A[1] + r[i] * A[2];
int _g = b[i] * A[3] + g[i] * A[4] + r[i] * A[5];
int _r = b[i] * A[6] + g[i] * A[7] + r[i] * A[8];

b[i] = _b;
g[i] = _g;
r[i] = _r;
}
}

// Author's note: better than loop4.
// Applies the 3x3 matrix A (stored row by row in A[0..8]) to 128 interleaved
// triples in bgr[], working on 8 triples at a time: split them into the small
// b/g/r arrays, transform those, then write the results back interleaved.
void loop4_()
{

// NOTE(review): bgr and A are read without being initialized; the function
// appears to exist only as vectorizer input.
int bgr[128 * 3];
int b[8];
int g[8];
int r[8];
int A[9];
// 128 / 8 = 16 groups of 8 triples each.
for (int i = 0; i < 128 / 8; ++i)
{
// split data
// start is the index of the first element of this group inside bgr.
int start = i * 8 * 3;
for (int j = 0; j < 8; ++j)
{
b[j] = bgr[start + j * 3];
g[j] = bgr[start + j * 3 + 1];
r[j] = bgr[start + j * 3 + 2];
}

// adjust
// Unit-stride loop over the split arrays; same arithmetic as loop4.
for (int j = 0; j < 8; ++j)
{
int _b = b[j] * A[0] + g[j] * A[1] + r[j] * A[2];
int _g = b[j] * A[3] + g[j] * A[4] + r[j] * A[5];
int _r = b[j] * A[6] + g[j] * A[7] + r[j] * A[8];

b[j] = _b;
g[j] = _g;
r[j] = _r;
}

// merge data
// Write the transformed values back to their interleaved positions.
for (int j = 0; j < 8; ++j)
{
bgr[start + j * 3] = b[j];
bgr[start + j * 3 + 1] = g[j];
bgr[start + j * 3 + 2] = r[j];
}
}

}

/*
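Vectorized output.
NOTE(review): the label $LL4@loop3_ and the 512-byte loop bound (128 ints)
suggest this listing was generated from the 128-element split-array loop
(loop4 in this file, presumably under an earlier name) rather than from
loop4_ - confirm against the original build.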
vpbroadcastd ymm1, DWORD PTR _A$[ebp]
vpbroadcastd ymm6, DWORD PTR _A$[ebp+8]
vpbroadcastd ymm7, DWORD PTR _A$[ebp+4]
vmovdqu YMMWORD PTR tv886[ebp], ymm1
vpbroadcastd ymm0, DWORD PTR _A$[ebp+20]
vmovdqu YMMWORD PTR tv881[ebp], ymm0
vpbroadcastd ymm0, DWORD PTR _A$[ebp+16]
vmovdqu YMMWORD PTR tv882[ebp], ymm0
vpbroadcastd ymm0, DWORD PTR _A$[ebp+12]
vmovdqu YMMWORD PTR tv883[ebp], ymm0
vpbroadcastd ymm0, DWORD PTR _A$[ebp+32]
vmovdqu YMMWORD PTR tv878[ebp], ymm0
vpbroadcastd ymm0, DWORD PTR _A$[ebp+28]
vmovdqu YMMWORD PTR tv879[ebp], ymm0
vpbroadcastd ymm0, DWORD PTR _A$[ebp+24]
vmovdqu YMMWORD PTR tv880[ebp], ymm0
xor eax, eax
npad 2
$LL4@loop3_:
vmovdqu ymm3, YMMWORD PTR _b$[ebp+eax]
vmovdqu ymm4, YMMWORD PTR _g$[ebp+eax]
vmovdqu ymm5, YMMWORD PTR _r$[ebp+eax]
vpmulld ymm2, ymm3, ymm1
vpmulld ymm1, ymm4, ymm7
vpmulld ymm0, ymm5, ymm6
vpaddd ymm0, ymm0, ymm1

vpmulld ymm1, ymm4, YMMWORD PTR tv882[ebp]
vpaddd ymm0, ymm0, ymm2
vpmulld ymm2, ymm3, YMMWORD PTR tv883[ebp]
vmovdqu YMMWORD PTR __b$[ebp+eax], ymm0
vpmulld ymm0, ymm5, YMMWORD PTR tv881[ebp]
vpaddd ymm0, ymm0, ymm1

vpmulld ymm1, ymm4, YMMWORD PTR tv879[ebp]
vmovdqu ymm4, YMMWORD PTR _g$[ebp+eax+32]
vpaddd ymm0, ymm0, ymm2
vpmulld ymm2, ymm3, YMMWORD PTR tv880[ebp]
vmovdqu ymm3, YMMWORD PTR _b$[ebp+eax+32]
vmovdqu YMMWORD PTR __g$[ebp+eax], ymm0
vpmulld ymm0, ymm5, YMMWORD PTR tv878[ebp]
vmovdqu ymm5, YMMWORD PTR _r$[ebp+eax+32]
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
vpmulld ymm2, ymm3, YMMWORD PTR tv886[ebp]
vmovdqu YMMWORD PTR __r$[ebp+eax], ymm0
vpmulld ymm1, ymm4, ymm7
vpmulld ymm0, ymm5, ymm6
vpaddd ymm0, ymm0, ymm1
vpmulld ymm1, ymm4, YMMWORD PTR tv882[ebp]
vpaddd ymm0, ymm0, ymm2
vpmulld ymm2, ymm3, YMMWORD PTR tv883[ebp]
vmovdqu YMMWORD PTR __b$[ebp+eax+32], ymm0
vpmulld ymm0, ymm5, YMMWORD PTR tv881[ebp]
vpaddd ymm0, ymm0, ymm1
vpmulld ymm1, ymm4, YMMWORD PTR tv879[ebp]
vpaddd ymm0, ymm0, ymm2
vpmulld ymm2, ymm3, YMMWORD PTR tv880[ebp]
vmovdqu YMMWORD PTR __g$[ebp+eax+32], ymm0
vpmulld ymm0, ymm5, YMMWORD PTR tv878[ebp]
vpaddd ymm0, ymm0, ymm1
vmovdqu ymm1, YMMWORD PTR tv886[ebp]
vpaddd ymm0, ymm0, ymm2
vmovdqu YMMWORD PTR __r$[ebp+eax+32], ymm0
add eax, 64 ; 00000040H
cmp eax, 512 ; 00000200H
jl $LL4@loop3_
*/

// The inner for loop is expected to be vectorized, but the VS compiler does not auto-vectorize it (1505).
// Using the variable i inside the inner loop prevents the optimization; loop6 uses a precomputed step so the inner loop can be optimized.
// Element-wise sum z = x + y over three heap-allocated 1024 * 1024 int arrays,
// indexed as row i, column j.
void loop5()
{
// NOTE(review): the contents of x and y are never initialized before being
// read; the function appears to exist only as vectorizer input.
int *x = new int[1024 * 1024];
int *y = new int[1024 * 1024];
int *z = new int[1024 * 1024];
for (int i = 0; i < 1024; ++i)
{
for (int j = 0; j < 1024; ++j)
{
// The row offset i * 1024 is recomputed inside every index expression.
z[i * 1024 + j] = x[i * 1024 + j] + y[i * 1024 + j];
}
}

delete[]x;
delete[]y;
delete[]z;
}

// The inner for loop is auto-vectorized.
// Same element-wise sum z = x + y over 1024 * 1024 ints as loop5, but the row
// offset is computed once per outer iteration and stored in step.
void loop6()
{
// NOTE(review): the contents of x and y are never initialized before being
// read; the function appears to exist only as vectorizer input.
int *x = new int[1024 * 1024];
int *y = new int[1024 * 1024];
int *z = new int[1024 * 1024];
for (int i = 0; i < 1024; ++i)
{
// Index of the first element of row i.
int step = i * 1024;
for (int j = 0; j < 1024; ++j)
{
z[step + j] = x[step + j] + y[step + j];
}
}

delete []x;
delete []y;
delete []z;

}

// The VS compiler reports that the loop body contains too little computation; when more computation is added it reports that the loop body contains non-contiguous memory accesses (1203).
// Flattening the inner loop makes the optimization possible, see loop8.
// This function simulates adding two images.
void loop7()
{
// Three buffers of 1024 * 1024 * 3 ints (three values per inner-loop step).
// NOTE(review): the contents of x and y are never initialized before being
// read; the function appears to exist only as vectorizer input.
int *x = new int[1024 * 1024 * 3];
int *y = new int[1024 * 1024 * 3];
int *z = new int[1024 * 1024 * 3];
for (int i = 0; i < 1024; ++i)
{
// Index of the first element of row i.
int step = i * 1024 * 3;
for (int j = 0; j < 1024; ++j)
{
// Each iteration handles the three consecutive elements at j * 3.
z[step + j * 3 ] = x[step + j * 3 ] + y[step + j * 3 ];
z[step + j * 3 + 1] = x[step + j * 3 + 1] + y[step + j * 3 + 1];
z[step + j * 3 + 2] = x[step + j * 3 + 2] + y[step + j * 3 + 2];
}
}

delete[]x;
delete[]y;
delete[]z;
}

// Auto-vectorizable.
// Element-wise sum z = x + y over 1024 rows of 1024 * 3 elements; the inner
// loop walks every element of the row with a unit-stride index.
// Unlike loop7, the element type here is unsigned char rather than int.
void loop8()
{
// NOTE(review): the contents of x and y are never initialized before being
// read; the function appears to exist only as vectorizer input.
unsigned char *x = new unsigned char[1024 * 1024 * 3];
unsigned char *y = new unsigned char[1024 * 1024 * 3];
unsigned char *z = new unsigned char[1024 * 1024 * 3];
for (int i = 0; i < 1024; ++i)
{
// Index of the first element of row i.
int step = i * 1024 * 3;
for (int j = 0; j < 1024 * 3; ++j)
{
z[step + j] = x[step + j] + y[step + j];
}
}

delete[]x;
delete[]y;
delete[]z;
}
/////////////////////////////////////////////////