我正在尝试验证仅包含ASCII可见字符,空格和\ t的字符串。
但是,在大多数CPU上,ASCII表查找似乎比带有_SIDD_CMP_RANGES的_mm_cmpestri指令快。 我已经在i5-2410M,i7-3720QM,i7-5600U和KVM虚拟化Xeon未知类型上进行了测试,只有在最后一个版本上,矢量化版本才更快。
我的测试代码在这里:
(
[09:00] => stdClass Object
(
[duration] => 40
)
[10:00] => stdClass Object
(
[duration] => 60
)
[11:15] =>
[11:30] =>
[11:45] => stdClass Object
(
[duration] => 45
)
)
使用#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#include <stdlib.h>
#define MIN(a,b) (((a)<(b))?(a):(b))
#define ALIGNED16 alignas(16)
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv,NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv,NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n",msg,((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,#define NO 0,#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * +,- . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
size_t search_logic(const char* data,size_t len) {
__m128i ht = _mm_set1_epi8('\t');
//__m128i del = _mm_set1_epi8(0x7f);
__m128i td = _mm_set1_epi8('~');
__m128i sp_m1 = _mm_set1_epi8(' ' - 1);
size_t i = 0;
while (len - i >= 16) {
__m128i c = _mm_loadu_si128((const __m128i *) (data + i));
// (!((c < del) && (c >= sp)) && (c != ht)) == 0
//if(!_mm_testc_si128(_mm_and_si128(_mm_cmpgt_epi8(c,sp_m1),_mm_cmplt_epi8(c,del)),_mm_xor_si128(c,ht)))
//break;
// !(c == del) && ((c == ht) || (c >= sp)) == 1
//if(!_mm_test_all_ones(_mm_andnot_si128(_mm_cmpeq_epi8(c,del),_mm_or_si128(_mm_cmpeq_epi8(c,ht),_mm_cmpgt_epi8(c,sp_m1)))))
//break;
// (((c != ht) && (c >= sp)) && (c > td)) == 0
if(!_mm_test_all_zeros(_mm_and_si128(_mm_xor_si128(c,sp_m1)),td)))
break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_table(const char* data,size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
while (len - i >= 16) {
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_sse4cmpstr(const char* data,size_t len)
{
static const char legal_ranges[16] = {
'\t','\t',' ','~',};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpestri(v1,4,v2,16,_SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
size_t search_sse4cmpstr_implicit(const char* data,};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1,_SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
int main()
{
printf("Setting up 1GB of data...\n");
size_t len = 1024 * 1024 * 1024 + 3;
char* data = (char*)mmap(NULL,len,PROT_READ|PROT_WRITE,MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE,-1,0); // Aligned
srand(0);
for (size_t i = 0; i < len; ++i) {
const char v = rand() % 96;
data[i] = v == 95 ? '\t' : ' ' + v;
}
size_t end = len - 2;
data[end] = '\n'; // Illegal character to be found
MEASURE("table lookup",{
size_t i = search_table(data,len);
if (i != end) printf("INCORRECT RESULT: %zu instead of %zu",i,end);
});
MEASURE("cmpestr ranges",{
size_t i = search_sse4cmpstr(data,end);
});
MEASURE("cmpistr ranges",{
size_t i = search_sse4cmpstr_implicit(data,end);
});
MEASURE("logic ranges",{
size_t i = search_logic(data,end);
});
}
进行编译,可以得到以下结果:
gcc -O3 -march=native -pedantic -Wall -Wextra main2.cpp
我还检查了汇编输出,当search_table未向量化时,search_sse4cmpstr使用vpcmpestri。
我使用错了吗?还是为什么这条指令根本不存在?
编辑: 正如评论中指出的那样,cmpistr(带有较少参数的隐式长度指令)比cmpestr稍快,有时甚至比表查找快。
但是,SSE2的按位和整数运算似乎更快。
EDIT2 彼得·科德斯(Peter Cordes)找到了正确的答案。 我已经在新答案中添加了修改后的程序,因此,如果您对cmpstr感兴趣,请查看此程序。
请勿使用上面的代码!