#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit: http://bmagic.sourceforge.net

*/


// Header implements processor-specific intrinsic declarations for the SSE2
// instruction set
#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"


namespace bm
{

// Population count of the 16-byte aligned block [block, block_end),
// computed with the classic SWAR bit-count ladder on 32-bit lanes.
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
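// Usage sketch (illustrative, not part of the original header). It assumes
// a 16-byte aligned buffer whose word count is a multiple of 4, e.g. a BM
// bit block of bm::set_block_size words (a constant defined elsewhere in
// the library); `blk` is a hypothetical name:
//
//   bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
//   blk[0] = 0xFF; // 8 bits set
//   bm::id_t cnt = bm::sse2_bit_count(
//       (const __m128i*) blk,
//       (const __m128i*) (blk + bm::set_block_size)); // cnt == 8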
// Population count of sse2_func(block[i], mask_block[i]) over two 16-byte
// aligned blocks. The combining operation (AND/OR/XOR/SUB) is fused into
// the bit-count loop, so the intermediate result is never written back.
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
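// Usage sketch (illustrative): counting the population of blk_a AND blk_b
// without materializing the intermediate block. sse2_and (declared in
// bmsse_util.h) is passed as the combining functor; blk_a/blk_b are
// hypothetical 16-byte aligned blocks of bm::set_block_size words:
//
//   bm::id_t cnt = bm::sse2_bit_count_op(
//       (const __m128i*) blk_a,
//       (const __m128i*) (blk_a + bm::set_block_size),
//       (const __m128i*) blk_b,
//       bm::sse2_and);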
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last)

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
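// The VECT_* macros above form the dispatch layer: when SSE2 optimization
// is compiled in, BM's generic block algorithms call these macros in place
// of their scalar loops. Illustrative expansion (a hedged sketch; `first`
// is a hypothetical pointer to a 16-byte aligned block):
//
//   // VECT_BITCOUNT(first, first + bm::set_block_size) expands to
//   sse2_bit_count((__m128i*) (first), (__m128i*) (first + bm::set_block_size));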
// One-pass statistics over a 16-byte aligned bit block: returns the number
// of bit-value changes (0->1 / 1->0 transitions, tracked across word
// boundaries) and stores the population count through bit_count.
inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0

    __m128i tmp1, tmp2;

    int count = (block_end - block) * 4; // number of 32-bit words in the block

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^ (w >> 1): per-word change pattern, stored for the scalar
            // change-counting pass below
            tmp1 = _mm_srli_epi32(b, 1);   // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = b ^ tmp1
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // parallel popcount of b (tmp1 already holds b >> 1)
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
                b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
                b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
                b    = _mm_and_si128(b, m4);      // & 0x0000003F

                mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b
            }
        }
        // ---------------------------------------------------------------
        // count changes: popcount(w ^ (w >> 1)) per word, with corrections
        // at the word boundaries
        {
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false; // blk[0] was already counted in the prologue
            }
            else
            {
                if ((w0 = blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if ((w0 = blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0 = blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0 = blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return count;
}

} // namespace bm

#endif
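// Usage sketch (illustrative, not part of the original header): one pass
// yields both statistics; `blk` is a hypothetical 16-byte aligned block of
// bm::set_block_size words:
//
//   unsigned bc = 0;
//   bm::id_t changes = bm::sse2_bit_block_calc_count_change(
//       (const __m128i*) blk,
//       (const __m128i*) (blk + bm::set_block_size),
//       &bc);
//   // bc == total set bits; changes == number of 0/1 transitions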