dlvhex  2.5.0
vs10/bm/bmsse4.h
Go to the documentation of this file.
00001 #ifndef BMSSE4__H__INCLUDED__
00002 #define BMSSE4__H__INCLUDED__
00003 /*
00004 Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
00005 
00006 Permission is hereby granted, free of charge, to any person 
00007 obtaining a copy of this software and associated documentation 
00008 files (the "Software"), to deal in the Software without restriction, 
00009 including without limitation the rights to use, copy, modify, merge, 
00010 publish, distribute, sublicense, and/or sell copies of the Software, 
00011 and to permit persons to whom the Software is furnished to do so, 
00012 subject to the following conditions:
00013 
00014 The above copyright notice and this permission notice shall be included 
00015 in all copies or substantial portions of the Software.
00016 
00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
00023 OTHER DEALINGS IN THE SOFTWARE.
00024 
00025 For more information please visit:  http://bmagic.sourceforge.net
00026 
00027 */
00028 
00029 
00030 
00031 //    Header implements processor-specific bit-counting routines on top of the
00032 //    SSE4 instruction set (SSE4.1 extract + SSE4.2 POPCNT intrinsics)
00033 #include<mmintrin.h>
00034 #include<emmintrin.h>
00035 #include<smmintrin.h>
00036 
00037 #include "bmdef.h"
00038 #include "bmsse_util.h"
00039 
00040 namespace bm
00041 {
00042 
00054 inline 
00055 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
00056 {
00057     bm::id_t count = 0;
00058 
00059 #ifdef BM64_SSE4
00060     do
00061     {
00062         __m128i tmp0 = _mm_load_si128(block);
00063         count += _mm_popcnt_u64(_mm_extract_epi64(tmp0, 0)) +
00064                  _mm_popcnt_u64(_mm_extract_epi64(tmp0, 1));
00065         __m128i tmp1 = _mm_load_si128(block+1);
00066         count += _mm_popcnt_u64(_mm_extract_epi64(tmp1, 0)) +
00067                  _mm_popcnt_u64(_mm_extract_epi64(tmp1, 1));
00068 
00069         block +=2;
00070     } while (block < block_end);
00071 
00072 #else
00073     do
00074     {
00075         const unsigned* b = (unsigned*) block;
00076         count += _mm_popcnt_u32(b[0]) +
00077                  _mm_popcnt_u32(b[1]) +
00078                  _mm_popcnt_u32(b[2]) +
00079                  _mm_popcnt_u32(b[3]);
00080     } while (++block < block_end);
00081 #endif    
00082     return count;
00083 }
00084 
00088 BMFORCEINLINE 
00089 unsigned op_xor(unsigned a, unsigned b)
00090 {
00091     unsigned ret = (a ^ b);
00092     return ret;
00093 }
00094 
00098 BMFORCEINLINE 
00099 unsigned op_or(unsigned a, unsigned b)
00100 {
00101     return (a | b);
00102 }
00103 
00107 BMFORCEINLINE 
00108 unsigned op_and(unsigned a, unsigned b)
00109 {
00110     return (a & b);
00111 }
00112 
00113 
00114 template<class Func>
00115 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block, 
00116                            const __m128i* BMRESTRICT block_end,
00117                            const __m128i* BMRESTRICT mask_block,
00118                            Func sse2_func)
00119 {
00120     bm::id_t count = 0;
00121 #ifdef BM64_SSE4
00122     do
00123     {
00124         __m128i tmp0 = _mm_load_si128(block);
00125         __m128i tmp1 = _mm_load_si128(mask_block);        
00126         __m128i b = sse2_func(tmp0, tmp1);
00127 
00128         count += _mm_popcnt_u64(_mm_extract_epi64(b, 0));
00129         count += _mm_popcnt_u64(_mm_extract_epi64(b, 1));
00130 
00131         ++block; ++mask_block;
00132     } while (block < block_end);
00133 #else    
00134     do
00135     {
00136         __m128i tmp0 = _mm_load_si128(block);
00137         __m128i tmp1 = _mm_load_si128(mask_block);        
00138         __m128i b = sse2_func(tmp0, tmp1);
00139 
00140         count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
00141         count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
00142         count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
00143         count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
00144 
00145         ++block; ++mask_block;
00146     } while (block < block_end);
00147 #endif
00148     
00149     return count;
00150 }
00151 
00152 /*
00153 template<class Func>
00154 bm::id_t sse4_bit_count_op2(const __m128i* BMRESTRICT block, 
00155                             const __m128i* BMRESTRICT block_end,
00156                             const __m128i* BMRESTRICT mask_block,
00157                            Func op_func)
00158 {
00159     bm::id_t count = 0;
00160 #ifdef BM64_SSE4    
00161     do
00162     {
00163         unsigned *r1 = (unsigned*) block;
00164         unsigned *r2 = (unsigned*) mask_block;
00165 
00166         count += _mm_popcnt_u32(op_func(r1[0], r2[0]));
00167         count += _mm_popcnt_u32(op_func(r1[1], r2[1]));
00168         count += _mm_popcnt_u32(op_func(r1[2], r2[2]));
00169         count += _mm_popcnt_u32(op_func(r1[3], r2[3]));
00170 
00171         ++mask_block;
00172 
00173     } while (++block < block_end);
00174 #else
00175     do
00176     {
00177         unsigned *r1 = (unsigned*) block;
00178         unsigned *r2 = (unsigned*) mask_block;
00179 
00180         count += _mm_popcnt_u32(op_func(r1[0], r2[0]));
00181         count += _mm_popcnt_u32(op_func(r1[1], r2[1]));
00182         count += _mm_popcnt_u32(op_func(r1[2], r2[2]));
00183         count += _mm_popcnt_u32(op_func(r1[3], r2[3]));
00184 
00185         ++mask_block;
00186 
00187     } while (++block < block_end);
00188 #endif    
00189     return count;
00190 
00191 }
00192 */
00193 
00194 
// Vector-operation dispatch macros.
//
// Each macro forwards to an SSE routine (the sse2_* helpers are presumably
// provided by bmsse_util.h; the sse4_* ones are defined in this header).
// Every macro argument is wrapped in parentheses so that expression
// arguments such as `p + 1` are converted as a whole instead of having the
// cast bind to `p` only.

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last)) 

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) 

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) 

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) 

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) 

// NOTE(review): the trailing ';' is part of the expansion, which makes this
// macro unsafe as the body of an unbraced if/else. It is kept because call
// sites outside this header may rely on it; the arguments are intentionally
// passed through without a __m128i* cast, as in the original.
#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr((first), (last));

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
00236 
00237 
00238 
00239 
00240 
00247 inline
00248 bm::id_t sse4_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
00249                                           const __m128i* BMRESTRICT block_end,
00250                                                unsigned* BMRESTRICT bit_count)
00251 {
00252 //   __m128i mask1 = _mm_set_epi32(0x1, 0x1, 0x1, 0x1);
00253    register int count = (block_end - block)*4; 
00254 
00255    register bm::word_t  w0, w_prev;
00256    const int w_shift = sizeof(w0) * 8 - 1;
00257    bool first_word = true;
00258    *bit_count = 0;
00259  
00260    // first word
00261    {
00262        bm::word_t  w;
00263        const bm::word_t* blk = (const bm::word_t*) block;
00264        w = w0 = blk[0];
00265        *bit_count += _mm_popcnt_u32(w);
00266        w ^= (w >> 1);
00267        count += _mm_popcnt_u32(w);
00268        count -= (w_prev = (w0 >> w_shift));
00269    }
00270 
00271    do
00272    {
00273        __m128i b = _mm_load_si128(block);
00274        __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); // tmp2=(b >> 1) ^ b;
00275        __m128i tmp3 = _mm_srli_epi32(b, w_shift); // tmp3 = w0 >> w_shift
00276 //       __m128i tmp4 = _mm_and_si128(b, mask1);    // tmp4 = w0 & 1 
00277 
00278        // ---------------------------------------------------------------------
00279        {
00280            if (first_word)
00281            {
00282                first_word = false;               
00283            }
00284            else
00285            {
00286                w0 = _mm_extract_epi32(b, 0);
00287                if (w0)
00288                {
00289                    *bit_count += _mm_popcnt_u32(w0);
00290                    count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
00291                    count -= !(w_prev ^ (w0 & 1));
00292                    count -= w_prev = _mm_extract_epi32(tmp3, 0);
00293                }
00294                else
00295                {
00296                    count -= !w_prev; w_prev ^= w_prev;
00297                }  
00298            }
00299            w0 = _mm_extract_epi32(b, 1);
00300            if (w0)
00301            {
00302                *bit_count += _mm_popcnt_u32(w0);
00303                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
00304                count -= !(w_prev ^ (w0 & 1));
00305                count -= w_prev = _mm_extract_epi32(tmp3, 1);                    
00306            }
00307            else
00308            {
00309                count -= !w_prev; w_prev ^= w_prev;
00310            }  
00311            w0 = _mm_extract_epi32(b, 2);
00312            if (w0)
00313            {
00314                *bit_count += _mm_popcnt_u32(w0);
00315                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
00316                count -= !(w_prev ^ (w0 & 1));
00317                count -= w_prev = _mm_extract_epi32(tmp3, 2);                   
00318            }
00319            else
00320            {
00321                count -= !w_prev; w_prev ^= w_prev;
00322            }  
00323            w0 = _mm_extract_epi32(b, 3);
00324            if (w0)
00325            {
00326                *bit_count += _mm_popcnt_u32(w0);
00327                count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
00328                count -= !(w_prev ^ (w0 & 1));
00329                count -= w_prev = _mm_extract_epi32(tmp3, 3);                    
00330            }
00331            else
00332            {
00333                count -= !w_prev; w_prev ^= w_prev;
00334            }               
00335        }
00336    } while (++block < block_end);
00337 
00338    return count;
00339 }
00340 
00341 
00342 
00343 } // namespace
00344 
00345 
00346 
00347 
00348 #endif