计算受标量积的启发。 C版本:
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
int main() {
clock_t t0,t1;
int64_t n = 10000000; // 10 million
int64_t *m = (int64_t*) malloc ((20*n) * sizeof(int64_t));
for (int64_t i = 0; i < n; i++) {
m[i] = rand() % 1000;
}
int64_t sum = 0;
int64_t k = 0;
t0 = clock();
for (int64_t i = 0; i < n; i++) {
sum += m[k]*m[k+1] + m[k+2]*m[k+3] + m[k+4]*m[k+5] + m[k+6]*m[k+7] + m[k+8]*m[k+9] + m[k+10]*m[k+11] + m[k+12]*m[k+13] + m[k+14]*m[k+15] + m[k+16]*m[k+17] + m[k+18]*m[k+19];
k += 20;
}
t1 = clock();
printf("%" PRId64 "\n",sum);
float diff = ((float) (t1 - t0) / 1000000.0F ) * 1000;
printf("%f",(float) diff);
}
常用版本:
主文件:
format PE64 console
entry prog
include "win64ax.inc"
section '.idata' import data readable writeable
library kernel32,'kernel32.dll',msvcrt,'msvcrt.dll'
import kernel32,ExitProcess,'ExitProcess',GetTickCount,'GetTickCount'
import msvcrt,printf,'printf'
section '.text' code readable executable
macro now
{
cinvoke GetTickCount
cinvoke printf,<"%lld",13,10>,rax
}
prog:
mov r14,temp
mov r15,10000000 ; 10 million
now
lbl:
dec r15
include "temp_code.asm"
add r14,80
cmp r15,0
jne lbl
now
end_prog:
invoke ExitProcess,0
section '.data' data readable writeable
align 8
temp dq 100000200 dup(0)
temp_code.asm:
mov rbx,0
mov rax,[r14 + 0 * 8]
mov rcx,[r14 + 1 * 8]
imul rax,rcx
add rbx,rax
mov rax,[r14 + 2 * 8]
mov rcx,[r14 + 3 * 8]
imul rax,[r14 + 4 * 8]
mov rcx,[r14 + 5 * 8]
imul rax,[r14 + 6 * 8]
mov rcx,[r14 + 7 * 8]
imul rax,[r14 + 8 * 8]
mov rcx,[r14 + 9 * 8]
imul rax,[r14 + 10 * 8]
mov rcx,[r14 + 11 * 8]
imul rax,[r14 + 12 * 8]
mov rcx,[r14 + 13 * 8]
imul rax,[r14 + 14 * 8]
mov rcx,[r14 + 15 * 8]
imul rax,[r14 + 16 * 8]
mov rcx,[r14 + 17 * 8]
imul rax,[r14 + 18 * 8]
mov rcx,[r14 + 19 * 8]
imul rax,rbx
fasm的最佳时间是93毫秒,带有编译选项“ gcc.exe -std = c99 -g 1.c -O3 -o 1.exe”的c的最佳时间是710毫秒。这慢了7.63倍。 我听说过数百次手写程序集无法击败c编译器,而现在……这是超级跑车的速度对赛跑者的速度。您的解释?