我必须承认我不太熟悉C。在尝试将Perl模块LCS::BV移植到C并调整我在一个会话中获得的代码的速度时,每秒发现5 G迭代的速度令人惊讶(相比之前的12 M / s)。
现在隔离原因期间,在Intel Core i7-4770HQ上,差异为〜50 G / s至〜70 M / s。
为确保clang不会展开基准测试的循环,我可以使用_mm_popcnt_u64(~v)
然后进行反编译:
$ otool -tv lcstest > lcstest.dump.txt
$ grep popcnt lcstest.dump.txt
0000000100000e26 popcntq %rax,%rax
我不确定基准代码或方法是否有问题。请参见带注释的代码(也可以在GitHub上找到):
#include <stdio.h>
#include <limits.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <nmmintrin.h>
static const uint64_t width = 64;
int count_bits(uint64_t bits) {
bits = bits - ((bits >> 1) & 0x5555555555555555ull);
bits = (bits & 0x3333333333333333ull) + ((bits >> 2) & 0x3333333333333333ull);
// (bytesof(bits) -1) * bitsofbyte = (8-1)*8 = 56 -------------------------------vv
return ((bits + (bits >> 4) & 0x0f0f0f0f0f0f0f0full) * 0x0101010101010101ull) >> 56;
}
int llcs_asci (char * a,char * b,uint32_t alen,uint32_t blen) {
// static uint64_t posbits[128] = { 0 }; // 73.4 (M/sec)
// uint64_t posbits[128] = { 0 }; // 53050.4 (M/sec)
uint64_t posbits[128]; // 56338.0 (M/sec)
uint64_t i;
for (i=0; i < 128; i++) { posbits[i] = 0; } // needed
for (i=0; i < alen; i++) {
posbits[(unsigned int)a[i]] |= 0x1ull << (i % width);
}
uint64_t v = ~0ull;
for (i=0; i < blen; i++) {
uint64_t p = posbits[(unsigned int)b[i]];
uint64_t u = v & p;
v = (v + u) | (v - u);
}
return count_bits(~v); // portable
//return _mm_popcnt_u64(~v);
}
int main (void) {
clock_t tic;
clock_t toc;
double elapsed;
double rate;
uint64_t count;
uint64_t megacount;
uint32_t iters = 1000000;
uint32_t megaiters = 1;
// m=10,n=11,llcs=7,d=4,sim=0.667
char str1[] = "Choerephon";
char str2[] = "Chrerrplzon";
uint32_t len1 = strlen(str1);
uint32_t len2 = strlen(str2);
int length_lcs;
/* ########## llcs_asci ########## */
tic = clock();
megaiters = 20;
for (megacount = 0; megacount < megaiters; megacount++) {
for (count = 0; count < iters; count++) {
length_lcs = llcs_asci (str1,str2,len1,len2);
}
}
toc = clock();
elapsed = (double)(toc - tic) / (double)CLOCKS_PER_SEC;
rate = (double)megaiters / (double)elapsed;
// need to use the result to avoid loop unrolling ---------------------vv
printf("[llcs_asci] iters: %u M Elapsed: %f s Rate: %.1f (M/sec) llcs: %u\n",megaiters,elapsed,rate,length_lcs);
/* #################### */
return 0;
}