给定以下函数
void foo(float* result, int size, float y, float delta) {
for (int t = 0; t < size; ++t) {
result[t] = y + delta * t;
}
}
Clang-O2
生成以下 x86-64 程序集:
.LCPI0_0:
.long 0
.long 1
.long 2
.long 3
.LCPI0_1:
.long 4
.long 4
.long 4
.long 4
.LCPI0_2:
.long 65535
.long 65535
.long 65535
.long 65535
.LCPI0_3:
.long 1258291200
.long 1258291200
.long 1258291200
.long 1258291200
.LCPI0_4:
.long 1392508928
.long 1392508928
.long 1392508928
.long 1392508928
.LCPI0_5:
.long 0x53000080
.long 0x53000080
.long 0x53000080
.long 0x53000080
.LCPI0_6:
.long 8
.long 8
.long 8
.long 8
foo(float*, int, float, float):
test esi, esi
jle .LBB0_7
mov eax, esi
cmp esi, 7
ja .LBB0_3
xor ecx, ecx
jmp .LBB0_6
.LBB0_3:
mov ecx, eax
and ecx, 2147483640
movaps xmm2, xmm1
shufps xmm2, xmm1, 0
movaps xmm3, xmm0
shufps xmm3, xmm0, 0
mov edx, eax
shr edx, 3
and edx, 268435455
shl rdx, 5
movdqa xmm4, xmmword ptr [rip + .LCPI0_0]
xor esi, esi
movdqa xmm5, xmmword ptr [rip + .LCPI0_1]
movdqa xmm6, xmmword ptr [rip + .LCPI0_2]
movdqa xmm7, xmmword ptr [rip + .LCPI0_3]
movdqa xmm8, xmmword ptr [rip + .LCPI0_4]
movaps xmm9, xmmword ptr [rip + .LCPI0_5]
movdqa xmm10, xmmword ptr [rip + .LCPI0_6]
.LBB0_4:
movdqa xmm11, xmm4
paddd xmm11, xmm5
movdqa xmm12, xmm4
pand xmm12, xmm6
por xmm12, xmm7
movdqa xmm13, xmm4
psrld xmm13, 16
por xmm13, xmm8
subps xmm13, xmm9
addps xmm13, xmm12
movdqa xmm12, xmm11
pand xmm12, xmm6
por xmm12, xmm7
psrld xmm11, 16
por xmm11, xmm8
subps xmm11, xmm9
addps xmm11, xmm12
mulps xmm13, xmm2
addps xmm13, xmm3
mulps xmm11, xmm2
addps xmm11, xmm3
movups xmmword ptr [rdi + rsi], xmm13
movups xmmword ptr [rdi + rsi + 16], xmm11
paddd xmm4, xmm10
add rsi, 32
cmp rdx, rsi
jne .LBB0_4
cmp ecx, eax
je .LBB0_7
.LBB0_6:
xorps xmm2, xmm2
cvtsi2ss xmm2, ecx
mulss xmm2, xmm1
addss xmm2, xmm0
movss dword ptr [rdi + 4*rcx], xmm2
inc rcx
cmp rax, rcx
jne .LBB0_6
.LBB0_7:
ret
我试图了解这里发生了什么。这似乎.LBB0_4
是一个循环,每次迭代都覆盖原始循环的 8 次迭代(有 2mulps
条指令,每条指令覆盖 4float
次并rsi
增加 32)。最后的代码可能是为了解决size
不能被 8 整除的情况。我遇到麻烦的是其余的代码。循环内的所有其他指令.LBB0_4
和开头的常量在做什么?是否有工具或编译器参数可以帮助我理解 SIMD 矢量化的结果?也许可以使用 SIMD 内在函数将其转换回 C++?
如果我将代码改为这样
void foo(float* result, int size, float y, float delta) {
for (int t = 0; t < size; ++t) {
result[t] = y;
y += delta;
}
}
编辑:我刚刚意识到这个版本根本没有矢量化,因此更小而且可能更慢。
编写此代码的最快方法是什么?