dlvhex  2.5.0
vs10/bm/bmsse2.h
Go to the documentation of this file.
00001 #ifndef BMSSE2__H__INCLUDED__
00002 #define BMSSE2__H__INCLUDED__
00003 /*
00004 Copyright(c) 2002-2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
00005 
00006 Permission is hereby granted, free of charge, to any person 
00007 obtaining a copy of this software and associated documentation 
00008 files (the "Software"), to deal in the Software without restriction, 
00009 including without limitation the rights to use, copy, modify, merge, 
00010 publish, distribute, sublicense, and/or sell copies of the Software, 
00011 and to permit persons to whom the Software is furnished to do so, 
00012 subject to the following conditions:
00013 
00014 The above copyright notice and this permission notice shall be included 
00015 in all copies or substantial portions of the Software.
00016 
00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
00023 OTHER DEALINGS IN THE SOFTWARE.
00024 
00025 For more information please visit:  http://bmagic.sourceforge.net
00026 
00027 */
00028 
00029 
00030 
00031 //    Header implements processor specific intrinsics declarations for SSE2
00032 //    instruction set
00033 #include<mmintrin.h>
00034 #include<emmintrin.h>
00035 
00036 #include "bmdef.h"
00037 #include "bmsse_util.h"
00038 
00039 
00040 namespace bm
00041 {
00042 
00043 
00063 inline 
00064 bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
00065 {
00066     const unsigned mu1 = 0x55555555;
00067     const unsigned mu2 = 0x33333333;
00068     const unsigned mu3 = 0x0F0F0F0F;
00069     const unsigned mu4 = 0x0000003F;
00070 
00071     // Loading masks
00072     __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
00073     __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
00074     __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
00075     __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
00076     __m128i mcnt;
00077     mcnt = _mm_xor_si128(m1, m1); // cnt = 0
00078 
00079     __m128i tmp1, tmp2;
00080     do
00081     {        
00082         __m128i b = _mm_load_si128(block);
00083         ++block;
00084 
00085         // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
00086         tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
00087         tmp1 = _mm_and_si128(tmp1, m1); 
00088         tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
00089         b    = _mm_add_epi32(tmp1, tmp2);               //  b = tmp1 + tmp2
00090 
00091         // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
00092         tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
00093         tmp1 = _mm_and_si128(tmp1, m2); 
00094         tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
00095         b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2
00096 
00097         // b = (b + (b >> 4)) & 0x0F0F0F0F;
00098         tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
00099         b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
00100         b = _mm_and_si128(b, m3);                       //           & 0x0F0F0F0F
00101 
00102         // b = b + (b >> 8);
00103         tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
00104         b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)
00105 
00106         // b = (b + (b >> 16)) & 0x0000003F;
00107         tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
00108         b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
00109         b = _mm_and_si128(b, m4);                       // (b >> 16) & 0x0000003F;
00110 
00111         mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b
00112 
00113     } while (block < block_end);
00114 
00115 
00116     bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
00117     _mm_store_si128((__m128i*)tcnt, mcnt);
00118 
00119     return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
00120 }
00121 
00122 
00123 
00124 template<class Func>
00125 bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block, 
00126                            const __m128i* BMRESTRICT block_end,
00127                            const __m128i* BMRESTRICT mask_block,
00128                            Func sse2_func)
00129 {
00130     const unsigned mu1 = 0x55555555;
00131     const unsigned mu2 = 0x33333333;
00132     const unsigned mu3 = 0x0F0F0F0F;
00133     const unsigned mu4 = 0x0000003F;
00134 
00135     // Loading masks
00136     __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
00137     __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
00138     __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
00139     __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
00140     __m128i mcnt;
00141     mcnt = _mm_xor_si128(m1, m1); // cnt = 0
00142     do
00143     {
00144         __m128i tmp1, tmp2;
00145         __m128i b = _mm_load_si128(block++);
00146 
00147         tmp1 = _mm_load_si128(mask_block++);
00148         
00149         b = sse2_func(b, tmp1);
00150                         
00151         // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
00152         tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
00153         tmp1 = _mm_and_si128(tmp1, m1); 
00154         tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
00155         b    = _mm_add_epi32(tmp1, tmp2);               //  b = tmp1 + tmp2
00156 
00157         // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
00158         tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
00159         tmp1 = _mm_and_si128(tmp1, m2); 
00160         tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
00161         b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2
00162 
00163         // b = (b + (b >> 4)) & 0x0F0F0F0F;
00164         tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
00165         b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
00166         b = _mm_and_si128(b, m3);                       //           & 0x0F0F0F0F
00167 
00168         // b = b + (b >> 8);
00169         tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
00170         b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)
00171         
00172         // b = (b + (b >> 16)) & 0x0000003F;
00173         tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
00174         b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
00175         b = _mm_and_si128(b, m4);                       // (b >> 16) & 0x0000003F;
00176 
00177         mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b
00178 
00179     } while (block < block_end);
00180 
00181     bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
00182     _mm_store_si128((__m128i*)tcnt, mcnt);
00183 
00184     return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
00185 }
00186 
00187 
00188 
00189 
00190 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
00191     sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)
00192 
00193 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
00194     sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)
00195 
00196 #define VECT_BITCOUNT(first, last) \
00197     sse2_bit_count((__m128i*) (first), (__m128i*) (last)) 
00198 
00199 #define VECT_BITCOUNT_AND(first, last, mask) \
00200     sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) 
00201 
00202 #define VECT_BITCOUNT_OR(first, last, mask) \
00203     sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) 
00204 
00205 #define VECT_BITCOUNT_XOR(first, last, mask) \
00206     sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) 
00207 
00208 #define VECT_BITCOUNT_SUB(first, last, mask) \
00209     sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) 
00210 
00211 #define VECT_INVERT_ARR(first, last) \
00212     sse2_invert_arr(first, last);
00213 
00214 #define VECT_AND_ARR(dst, src, src_end) \
00215     sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
00216 
00217 #define VECT_OR_ARR(dst, src, src_end) \
00218     sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
00219 
00220 #define VECT_SUB_ARR(dst, src, src_end) \
00221     sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
00222 
00223 #define VECT_XOR_ARR(dst, src, src_end) \
00224     sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
00225 
00226 #define VECT_COPY_BLOCK(dst, src, src_end) \
00227     sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))
00228 
00229 #define VECT_SET_BLOCK(dst, dst_end, value) \
00230     sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))
00231 
00232 
00233 
00234 
00235 
00236 inline
00237 bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
00238                                           const __m128i* BMRESTRICT block_end,
00239                                                unsigned* BMRESTRICT bit_count)
00240 {
00241    const unsigned mu1 = 0x55555555;
00242    const unsigned mu2 = 0x33333333;
00243    const unsigned mu3 = 0x0F0F0F0F;
00244    const unsigned mu4 = 0x0000003F;
00245 
00246    // Loading masks
00247    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
00248    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
00249    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
00250    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
00251    __m128i mcnt, ccnt;
00252    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
00253    ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0
00254 
00255    __m128i tmp1, tmp2;
00256 
00257    int count = (block_end - block)*4; //0;//1;
00258 
00259    bm::word_t  w, w0, w_prev;//, w_l;
00260    const int w_shift = sizeof(w) * 8 - 1;
00261    bool first_word = true;
00262  
00263    // first word
00264    {
00265        const bm::word_t* blk = (const bm::word_t*) block;
00266        w = w0 = blk[0];
00267        w ^= (w >> 1);
00268        BM_INCWORD_BITCOUNT(count, w);
00269        count -= (w_prev = (w0 >> w_shift)); // negative value correction
00270    }
00271 
00272    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
00273 
00274    do
00275    {
00276        // compute bit-count
00277        // ---------------------------------------------------------------------
00278        {
00279        __m128i b = _mm_load_si128(block);
00280 
00281        // w ^(w >> 1)
00282        tmp1 = _mm_srli_epi32(b, 1);       // tmp1 = b >> 1
00283        tmp2 = _mm_xor_si128(b, tmp1);     // tmp2 = tmp1 ^ b;
00284        _mm_store_si128((__m128i*)tcnt, tmp2);
00285        
00286 
00287        // compare with zero
00288        {
00289            // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
00290            //tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
00291            tmp1 = _mm_and_si128(tmp1, m1);
00292            tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
00293            b    = _mm_add_epi32(tmp1, tmp2);               //  b = tmp1 + tmp2
00294 
00295            // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
00296            tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
00297            tmp1 = _mm_and_si128(tmp1, m2);
00298            tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
00299            b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2
00300 
00301            // b = (b + (b >> 4)) & 0x0F0F0F0F;
00302            tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
00303            b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
00304            b = _mm_and_si128(b, m3);                       //& 0x0F0F0F0F
00305 
00306            // b = b + (b >> 8);
00307            tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
00308            b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)
00309 
00310            // b = (b + (b >> 16)) & 0x0000003F;
00311            tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
00312            b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
00313            b = _mm_and_si128(b, m4);                       // (b >> 16) & 0x0000003F;
00314 
00315            mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b
00316        }
00317 
00318        }
00319        // ---------------------------------------------------------------------
00320        {
00321            //__m128i b = _mm_load_si128(block);
00322 
00323            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;
00324 
00325            if (first_word)
00326            {
00327                first_word = false;
00328            }
00329            else
00330            {
00331                if ((w0=blk[0]))
00332                {
00333                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
00334                    count -= !(w_prev ^ (w0 & 1));
00335                    count -= w_prev = (w0 >> w_shift);
00336                }
00337                else
00338                {
00339                    count -= !w_prev; w_prev ^= w_prev;
00340                }  
00341            }
00342            if ((w0=blk[1]))
00343            {
00344                BM_INCWORD_BITCOUNT(count, tcnt[1]);
00345                count -= !(w_prev ^ (w0 & 1));
00346                count -= w_prev = (w0 >> w_shift);                    
00347            }
00348            else
00349            {
00350                count -= !w_prev; w_prev ^= w_prev;
00351            }    
00352            if ((w0=blk[2]))
00353            {
00354                BM_INCWORD_BITCOUNT(count, tcnt[2]);
00355                count -= !(w_prev ^ (w0 & 1));
00356                count -= w_prev = (w0 >> w_shift);                    
00357            }
00358            else
00359            {
00360                count -= !w_prev; w_prev ^= w_prev;
00361            }      
00362            if ((w0=blk[3]))
00363            {
00364                BM_INCWORD_BITCOUNT(count, tcnt[3]);
00365                count -= !(w_prev ^ (w0 & 1));
00366                count -= w_prev = (w0 >> w_shift);                    
00367            }
00368            else
00369            {
00370                count -= !w_prev; w_prev ^= w_prev;
00371            }               
00372        }
00373    } while (++block < block_end);
00374 
00375    _mm_store_si128((__m128i*)tcnt, mcnt);
00376    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
00377 
00378    return count;
00379 }
00380 
00381 } // namespace
00382 
00383 
00384 
00385 
00386 #endif