#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__
/*
Copyright(c) 2002-2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit: http://bmagic.sourceforge.net

*/


// Header implements processor-specific intrinsic declarations for the SSE2
// instruction set
#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"


namespace bm
{

// Population count of the 16-byte aligned block [block, block_end),
// computed with the classic SWAR bit-count ladder on 32-bit lanes.
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
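// Usage sketch (illustrative, not part of the original header). It assumes
// a 16-byte aligned buffer whose word count is a multiple of 4, e.g. a BM
// bit block of bm::set_block_size words (a constant defined elsewhere in
// the library); `blk` is a hypothetical name:
//
//   bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
//   blk[0] = 0xFF; // 8 bits set
//   bm::id_t cnt = bm::sse2_bit_count(
//       (const __m128i*) blk,
//       (const __m128i*) (blk + bm::set_block_size)); // cnt == 8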
// Population count of sse2_func(block[i], mask_block[i]) over two 16-byte
// aligned blocks. The combining operation (AND/OR/XOR/SUB) is fused into
// the bit-count loop, so the intermediate result is never written back.
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1);

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);      // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
        b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
        b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
        b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
        b    = _mm_and_si128(b, m4);      // & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b

    } while (block < block_end);

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
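// Usage sketch (illustrative): counting the population of blk_a AND blk_b
// without materializing the intermediate block. sse2_and (declared in
// bmsse_util.h) is passed as the combining functor; blk_a/blk_b are
// hypothetical 16-byte aligned blocks of bm::set_block_size words:
//
//   bm::id_t cnt = bm::sse2_bit_count_op(
//       (const __m128i*) blk_a,
//       (const __m128i*) (blk_a + bm::set_block_size),
//       (const __m128i*) blk_b,
//       bm::sse2_and);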
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last)

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
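// The VECT_* macros above form the dispatch layer: when SSE2 optimization
// is compiled in, BM's generic block algorithms call these macros in place
// of their scalar loops. Illustrative expansion (a hedged sketch; `first`
// is a hypothetical pointer to a 16-byte aligned block):
//
//   // VECT_BITCOUNT(first, first + bm::set_block_size) expands to
//   sse2_bit_count((__m128i*) (first), (__m128i*) (first + bm::set_block_size));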
// One-pass statistics over a 16-byte aligned bit block: returns the number
// of bit-value changes (0->1 / 1->0 transitions, tracked across word
// boundaries) and stores the population count through bit_count.
inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Load masks
    __m128i m1 = _mm_set_epi32(mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32(mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32(mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32(mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0

    __m128i tmp1, tmp2;

    int count = (block_end - block) * 4; // number of 32-bit words in the block

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // negative value correction
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // compute bit-count
        // ---------------------------------------------------------------
        {
            __m128i b = _mm_load_si128(block);

            // w ^ (w >> 1): per-word change pattern, stored for the scalar
            // change-counting pass below
            tmp1 = _mm_srli_epi32(b, 1);   // tmp1 = b >> 1
            tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = b ^ tmp1
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // parallel popcount of b (tmp1 already holds b >> 1)
            {
                // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);      // tmp2 = (b & 0x55555555)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
                tmp1 = _mm_srli_epi32(b, 2);      // (b >> 2 & 0x33333333)
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);      // (b & 0x33333333)
                b    = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2

                // b = (b + (b >> 4)) & 0x0F0F0F0F;
                tmp1 = _mm_srli_epi32(b, 4);      // tmp1 = b >> 4
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 4)
                b    = _mm_and_si128(b, m3);      // & 0x0F0F0F0F

                // b = b + (b >> 8);
                tmp1 = _mm_srli_epi32(b, 8);      // tmp1 = b >> 8
                b    = _mm_add_epi32(b, tmp1);    // b = b + (b >> 8)

                // b = (b + (b >> 16)) & 0x0000003F;
                tmp1 = _mm_srli_epi32(b, 16);     // b >> 16
                b    = _mm_add_epi32(b, tmp1);    // b + (b >> 16)
                b    = _mm_and_si128(b, m4);      // & 0x0000003F

                mcnt = _mm_add_epi32(mcnt, b);    // mcnt += b
            }
        }
        // ---------------------------------------------------------------
        // count changes: popcount(w ^ (w >> 1)) per word, with corrections
        // at the word boundaries
        {
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false; // blk[0] was already counted in the prologue
            }
            else
            {
                if ((w0 = blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev;
                }
            }
            if ((w0 = blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0 = blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0 = blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return count;
}

} // namespace bm

#endif
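// Usage sketch (illustrative, not part of the original header): one pass
// yields both statistics; `blk` is a hypothetical 16-byte aligned block of
// bm::set_block_size words:
//
//   unsigned bc = 0;
//   bm::id_t changes = bm::sse2_bit_block_calc_count_change(
//       (const __m128i*) blk,
//       (const __m128i*) (blk + bm::set_block_size),
//       &bc);
//   // bc == total set bits; changes == number of 0/1 transitions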