4.59
4.47 is the better one
loop part in 4.47
L4:
mrmovq 8(%rax), %r9
mrmovq (%rax), %r10
rrmovq %r9, %r8
subq %r10, %r8
jge L3
rmmovq %r10, 8(%rax)
rmmovq %r9, (%rax)
50% jge is right, run 5 instructions; 50% jge is wrong, run 7 instructions and 2
nop bubble. so Cycles Per Loop is 50% 5 + (7 + 2) 50% = 7
loop part in 4.48
L4:
mrmovq 8(%rax), %r9
mrmovq (%rax), %r10
rrmovq %r9, %r8
subq %r10, %r8
cmovl %r9, %r11
cmovl %r10, %r9
cmovl %r11, %r10
rmmovq %r9, 8(%rax)
rmmovq %r10, (%rax)
Cycles Per Loop is 9
loop part in 4.49
L4:
mrmovq 8(%rax), %r9
mrmovq (%rax), %r10
rrmovq %r9, %r8
rrmovq %r10, %r11
xorq %r9, %r10
subq %r11, %r8
cmovge %r11, %r9
xorq %r10, %r9
xorq %r9, %r10
rmmovq %r9, 8(%rax)
rmmovq %r10, (%rax)
Cycles Per Loop is 11
当前内容版权归 DreamAndDead 或其关联方所有,如需对内容或内容相关联开源项目进行关注与资助,请访问 DreamAndDead .