#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
/**
 * @brief Counts set bits by inspecting one bit per iteration.
 *
 * The lowest bit is added to the running total and the value is shifted
 * right by one until nothing is left. The loop therefore runs once for every
 * bit position up to the highest set bit (at most 32 times), which makes
 * this the simplest and also the slowest method in this file.
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t iterated_popcnt(uint32_t n)
{
    uint32_t count = 0;
    for (; n; n >>= 1)
    {
        count += n & 1u;
    }
    return count;
}
/**
 * @brief Counts set bits by clearing the lowest set bit on every iteration.
 *
 * An improvement over iterated_popcnt: `n & (n - 1)` removes the rightmost
 * 1 bit, so the loop runs exactly once per set bit. For a value whose only
 * set bit is the most significant one, this needs a single iteration,
 * whereas iterated_popcnt still walks through all 32 bit positions.
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t sparse_popcnt(uint32_t n)
{
    uint32_t count = 0;
    while (n)
    {
        ++count;
        n &= n - 1; /* drop the lowest set bit */
    }
    return count;
}
/**
 * @brief Counts set bits by counting the zero bits instead.
 *
 * The mirror image of sparse_popcnt: the value is complemented, then the
 * lowest set bit of the complement (a zero bit of the original) is cleared
 * once per iteration while the counter walks down from the full bit width.
 * The loop runs once per zero bit of the input, so it is fastest when most
 * bits are set.
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t dense_popcnt(uint32_t n)
{
    uint32_t count = (uint32_t)(CHAR_BIT * sizeof(uint32_t));
    n = ~n; /* from here on, every 1 bit marks a zero bit of the input */
    while (n)
    {
        --count;
        n &= n - 1; /* drop the lowest set bit */
    }
    return count;
}
/*
 * Table-driven popcount: the operand is split into bytes and the per-byte
 * counts are read from a 256-entry table and added up.
 * Both variants below index the table with exactly four 8-bit bytes.
 */
#if CHAR_BIT != 8
#error "lookup_popcnt assumes 8-bit bytes"
#endif
#ifdef USE_MACRO
/*
 * Compile-time table generators. BITk(n) expands to the 2^k bit counts of
 * all k-bit values, each offset by n; BIT8(0) is the full 256-entry table.
 */
#ifndef BIT8
#define BIT2(n) (n), (n) + 1, (n) + 1, (n) + 2
#define BIT4(n) BIT2(n), BIT2((n) + 1), BIT2((n) + 1), BIT2((n) + 2)
#define BIT6(n) BIT4(n), BIT4((n) + 1), BIT4((n) + 1), BIT4((n) + 2)
#define BIT8(n) BIT6(n), BIT6((n) + 1), BIT6((n) + 1), BIT6((n) + 2)
#endif
/**
 * @brief Counts set bits with a table that is built at compile time.
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t lookup_popcnt(uint32_t n)
{
    static const uint8_t TABLE[UCHAR_MAX + 1] = { BIT8(0) };
    return (uint32_t)(
        TABLE[(n      ) & UCHAR_MAX] +
        TABLE[(n >>  8) & UCHAR_MAX] +
        TABLE[(n >> 16) & UCHAR_MAX] +
        TABLE[(n >> 24) & UCHAR_MAX]);
}
#else
/*
 * Run-time table. It starts zero-filled (static storage) and must be
 * populated by the caller before lookup_popcnt is used; main() does this.
 */
static uint8_t TABLE[1u << CHAR_BIT];
const size_t TBL_LEN = sizeof TABLE / sizeof TABLE[0];
/**
 * @brief Counts set bits by looking up each byte of n in TABLE.
 *
 * The bytes are read through a character pointer; their order in memory
 * does not matter because the per-byte counts are simply added together.
 *
 * @param n value whose set bits are counted
 * @return uint32_t sum of the table entries for the four bytes of n
 */
uint32_t lookup_popcnt(uint32_t n)
{
    const unsigned char *p = (const unsigned char *)&n;
    return (uint32_t)(TABLE[p[0]] + TABLE[p[1]] + TABLE[p[2]] + TABLE[p[3]]);
}
#endif /* USE_MACRO */
/*
 * Helpers for the parallel (divide-and-conquer) popcount.
 *   POW2(c)     2^c
 *   MASK(c)     alternating runs of 2^c zero bits and 2^c one bits, e.g.
 *               MASK(0) = 0x55555555, MASK(1) = 0x33333333, MASK(2) = 0x0F0F0F0F
 *               (values shown for a 32-bit unsigned int)
 *   COUNT(x, c) adds each field of width 2^c to its upper neighbour, leaving
 *               the sums in fields of width 2^(c+1)
 * COUNT expands x twice, so pass an expression without side effects.
 */
#define POW2(c)     (1u << (c))
#define MASK(c)     (UINT_MAX / (POW2(POW2(c)) + 1u))
#define COUNT(x, c) (((x) & MASK(c)) + (((x) >> POW2(c)) & MASK(c)))
/**
 * @brief Counts set bits by summing neighbouring bit fields in parallel.
 *
 * Five doubling steps turn 32 one-bit counters into a single 32-bit sum:
 * 1-bit fields -> 2-bit sums -> 4-bit -> 8-bit -> 16-bit -> 32-bit.
 * A 64-bit operand would need one more step, COUNT(n, 5).
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t parallel_popcnt(uint32_t n)
{
    n = COUNT(n, 0);
    n = COUNT(n, 1);
    n = COUNT(n, 2);
    n = COUNT(n, 3);
    n = COUNT(n, 4);
    return n;
}
/* Repeating bit patterns 0101..., 0011... and 00001111..., derived from the all-ones value. */
#define MASK_01010101 (((unsigned int)(-1))/3)
#define MASK_00110011 (((unsigned int)(-1))/5)
#define MASK_00001111 (((unsigned int)(-1))/17)
/**
 * @brief Counts set bits with three parallel additions and one modulo.
 *
 * For a 2-bit number ab = 2a + b:
 *   step 1 yields a + b
 *
 * For a 4-bit number abcd = ((2a + b) << 2) + (2c + d):
 *   step 1 yields ((a + b) << 2) + (c + d)
 *   step 2 yields a + b + c + d
 *
 * For an 8-bit number abcd efgh:
 *   step 2 yields ((a + b + c + d) << 4) + (e + f + g + h)
 *   step 3 yields sum(a:h)
 *
 * For a 16-bit number a~h i~p:
 *   step 3 yields (sum(a:h) << 8) + sum(i:p)
 *
 * The last line relies on this fact: a base-K number is congruent, modulo
 * K - 1, to the sum of its base-K digits. After three steps n can be read
 * as a base-256 number whose digits are the per-byte bit counts, so
 * n % 255 adds those digits together. The result is exact as long as the
 * total stays below 255, which always holds for a 32-bit operand.
 *
 * How expensive the modulo itself is remains a separate question.
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t nifty_popcnt(uint32_t n)
{
    n = (n & MASK_01010101) + ((n >> 1) & MASK_01010101);
    n = (n & MASK_00110011) + ((n >> 2) & MASK_00110011);
    n = (n & MASK_00001111) + ((n >> 4) & MASK_00001111);
    return n % 255;
}
/**
 * @brief Counts set bits with five shift/mask steps and no modulo.
 *
 * For a 2-bit number ab = 2a + b:
 *   step 1 yields (2a + b) - a = a + b
 *
 * For a 4-bit number abcd:
 *   step 1 yields ((a + b) << 2) + (c + d)
 *   step 2 yields sum(a:d)
 *
 * For a 16-bit number a~h i~p:
 *   step 3 yields (sum(a:h) << 8) + sum(i:p)
 *   step 4 yields sum(a:p) in the low byte
 *
 * For a 32-bit number made of two 16-bit halves:
 *   step 4 yields the bit count of each half in that half's low byte
 *   step 5 adds the two halves' counts in the low byte
 *
 * The upper bits hold leftover partial sums after step 5, so the final mask
 * keeps only the low 6 bits, which are enough for the maximum result of 32.
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t hacker_popcnt(uint32_t n)
{
    n -= (n >> 1) & 0x55555555;
    n  = (n & 0x33333333) + ((n >> 2) & 0x33333333);
    n  = ((n >> 4) + n) & 0x0F0F0F0F;
    n += n >> 8;
    n += n >> 16;
    return n & 0x0000003F;
}
/* HAKMEM Popcount
Consider a 3 bit number as being
4a+2b+c
if we shift it right 1 bit, we have
2a+b
subtracting this from the original gives
2a+b+c
if we shift the original 2 bits right we get
a
and so with another subtraction we have
a+b+c
which is the number of bits in the original number.
Suitable masking allows the sums of the octal digits in a 32 bit number to
appear in each octal digit. This isn't much help unless we can get all of
them summed together. This can be done by modulo arithmetic (sum the digits
in a number by modulo the base of the number minus one), the old "casting out
nines" trick they taught in school before calculators were invented. Now,
using mod 7 won't help us, because our number will very likely have more than 7
bits set. So add the octal digits together to get base64 digits, and use
modulo 63. (Those of you with 64 bit machines need to add 3 octal digits
together to get base512 digits, and use mod 511.)
This is HAKMEM item 169, as used in X11 sources.
Source: MIT AI Lab memo, late 1970's.
*/
/**
 * @brief Counts set bits with the HAKMEM octal-digit method described above.
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t hakmem_popcnt(uint32_t n)
{
    /* Each octal digit of tmp holds the bit count of the matching digit of n. */
    uint32_t tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
    /* Pair the octal digits into base-64 digits, then sum those with mod 63. */
    return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}
/**
 * @brief Counts set bits with the processor's POPCNT instruction when available.
 *
 * Two inline-assembly dialects are used:
 *   - MSVC (32-bit x86 only): Intel syntax, destination first:
 *         popcnt eax, n
 *   - GCC-compatible compilers on x86: AT&T syntax, source first:
 *         popcnt %1, %0
 * Every path stores the result in a variable and returns it explicitly, so
 * no "missing return statement" warning is produced and the function never
 * runs off its end. The x86 paths require a CPU that implements POPCNT.
 * Other GCC-compatible targets use the compiler builtin, and anything else
 * falls back to a portable loop.
 *
 * Background reading on inline assembly:
 * http://msdn.microsoft.com/en-us/library/4ks26t93(v=vs.110).aspx
 * http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
 *
 * @param n value whose set bits are counted
 * @return uint32_t number of 1 bits in n (0..32)
 */
uint32_t assembly_popcnt(uint32_t n)
{
#if defined(_MSC_VER) && defined(_M_IX86) /* Intel-style inline assembly */
    uint32_t result;
    __asm
    {
        popcnt eax, n
        mov result, eax
    }
    return result;
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) /* AT&T-style inline assembly */
    uint32_t result;
    /* POPCNT rewrites the flags register, hence the "cc" clobber. */
    __asm__("popcnt %1, %0" : "=r"(result) : "r"(n) : "cc");
    return result;
#elif defined(__GNUC__) /* non-x86 target: let the compiler pick the instruction */
    return (uint32_t)__builtin_popcount(n);
#else /* unknown compiler: portable fallback */
    uint32_t count = 0;
    for (; n; n &= n - 1)
    {
        ++count;
    }
    return count;
#endif
}
/**
 * @brief Times every popcount implementation over the same range of inputs.
 *
 * When the table is not generated by macros, it is filled in first. Each
 * method is then called for every value in [0, NUM) and the average CPU time
 * per call, measured with clock(), is printed.
 *
 * @return int always 0
 */
int main(void)
{
#if !defined(USE_MACRO)
    /* Generate the table algorithmically: popcount(i) = popcount(i >> 1) + lowest bit of i. */
    for (size_t i = 0; i < TBL_LEN; ++i)
    {
        TABLE[i] = (uint8_t)(TABLE[i >> 1] + (i & 1u));
    }
#endif
    typedef uint32_t (*FUNC_POPCNT)(uint32_t);
    const struct Pair
    {
        FUNC_POPCNT pfunc;
        const char *name;
    } METHOD[] =
    {
#define ELEMENT(n) {(n), #n}
        ELEMENT(iterated_popcnt),
        ELEMENT(  sparse_popcnt),
        ELEMENT(   dense_popcnt),
        ELEMENT(  lookup_popcnt),
        ELEMENT(parallel_popcnt),
        ELEMENT(   nifty_popcnt),
        ELEMENT(  hacker_popcnt),
        ELEMENT(  hakmem_popcnt),
        ELEMENT(assembly_popcnt)
#undef ELEMENT
    };
    const uint32_t NUM = 0x10000000; /* alternative iteration count: 0xDEADBEAF */
    const unsigned int method_count = (unsigned int)(sizeof(METHOD) / sizeof(METHOD[0]));
    /* Results are funnelled into a volatile so the calls cannot be optimised away. */
    volatile uint32_t sink = 0;

    printf("after iterating %u times,\n", (unsigned int)NUM);
    for (unsigned int i = 0; i < method_count; ++i)
    {
        uint32_t checksum = 0;
        const clock_t start = clock(); /* clock() returns clock_t, not time_t */
        for (uint32_t j = 0; j < NUM; ++j)
        {
            checksum += METHOD[i].pfunc(j);
        }
        const clock_t stop = clock();
        sink = checksum;
        const double elapsed_time = (double)(stop - start) / CLOCKS_PER_SEC / NUM;
        printf("%u. method %15s uses %gs\n", i, METHOD[i].name, elapsed_time);
    }
    (void)sink;
    return 0;
}
/*
http://resnet.uoregon.edu/~gurney_j/jmpc/bitwise.html
*/
/*
 * BITCOUNT(x): bit count of a 32-bit value x.
 * BX_(x) leaves a per-nibble bit count in every 4-bit group; adding the value
 * to itself shifted right by 4 and masking with 0x0F0F0F0F merges neighbouring
 * nibbles into per-byte counts, and "% 255" adds those byte counts together
 * (a base-256 number is congruent to its digit sum modulo 255).
 * The argument is expanded eight times, so it must have no side effects.
 */
#define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
/*
 * BX_(x): replaces every 4-bit group abcd of x with a + b + c + d, since
 * (8a+4b+2c+d) - (4a+2b+c) - (2a+b) - a = a+b+c+d. The masks keep each
 * shifted copy from borrowing bits out of the neighbouring group.
 */
#define BX_(x) ((x) - (((x)>>1)&0x77777777) - (((x)>>2)&0x33333333) - (((x)>>3)&0x11111111))
以一个4bit的数据x = abcd为例,其中a~d为0或1。
则
x = 8a + 4b + 2c+ d
x>>1 = 4a + 2b + c
x>>2 = 2a + b
x>>3 = a
故
BX_(x) = x - (x>>1) - (x>>2) - (x>>3) = a + b + c +d
所以,BX_(X) 可以看作低4位的BIT_COUNT运算。
进一步,如果是一个8位的ABCD0000进行BX_()运算:
BX_(ABCD0000) = ABCD0000 - (0ABC0000) - (00AB0000) - (000A0000)
= (A+B+C+D)<<4
所以,写到这里BX_(x)的功能基本就可以猜出来了:它以4bit为一个单位进行BIT_COUNT,也就是把x看作16进制数,并把每一位16进制数字替换为该数字中1的个数。
上例中BX_(x)最多可以支持0xffff_ffff,即32位的数据
那么可以猜测一下,BITCOUNT()应该实现如下功能:
BITCOUNT(x) = sum( 0x0000_000f & (BX_(x) >> (i*4)) ),其中sum()表示求和运算,i的取值为0~7。
下面来看BITCOUNT(x)的定义,比较简单:
#define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
先来假设x是一个8位的数,此时结果小于255,取模255不起作用,只有括号里的部分有作用(掩码只需保留低4位),显然成立,相当于:
#define BITCOUNT(x) ((BX_(x)+(BX_(x)>>4)) & 0x0F)
再来看如果x是一个16位的数,比如abcd,其中a~d代表一个4bit的数。
并且BX_(abcd) = efgh, BITCOUNT 应该等于 e+f+g+h。
则BITCOUNT(abcd) = ((efgh + 0efg ) & 0x0f0f) %0xff
= 0j0k % 0xff ,其中j= e+f, k = g+h
= j+k = e+f+g+h,得证。
如果,x是一个32位的数呢,比如ijkl_mnop,则BX_(ijkl_mnop)=abcd_efgh,需要证明 BITCOUNT = sum(a:h)
BITCOUNT(x) = ((abcd_efgh + 0abc_defg) & 0x0f0f_0f0f) % 0xff
= 0q0r0s0t % 0xff, 其中 q=a+b,r=c+d, s=e+f, t=g+h
= q+r+s+t = sum(a:h),从而得证。
可以进一步思考,仅从BX_(x)的0x77777777等考虑支持32位数据,但从取余255考虑那该支持多少位呢,哈哈。
手机扫一扫
移动阅读更方便
你可能感兴趣的文章