dlvhex
2.5.0
|
00001 #ifndef BMSSE4__H__INCLUDED__ 00002 #define BMSSE4__H__INCLUDED__ 00003 /* 00004 Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) 00005 00006 Permission is hereby granted, free of charge, to any person 00007 obtaining a copy of this software and associated documentation 00008 files (the "Software"), to deal in the Software without restriction, 00009 including without limitation the rights to use, copy, modify, merge, 00010 publish, distribute, sublicense, and/or sell copies of the Software, 00011 and to permit persons to whom the Software is furnished to do so, 00012 subject to the following conditions: 00013 00014 The above copyright notice and this permission notice shall be included 00015 in all copies or substantial portions of the Software. 00016 00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 00023 OTHER DEALINGS IN THE SOFTWARE. 00024 00025 For more information please visit: http://bmagic.sourceforge.net 00026 00027 */ 00028 00029 00030 00031 // Header implements processor specific intrinsics declarations for SSE2 00032 // instruction set 00033 #include<mmintrin.h> 00034 #include<emmintrin.h> 00035 #include<smmintrin.h> 00036 00037 #include "bmdef.h" 00038 #include "bmsse_util.h" 00039 00040 namespace bm 00041 { 00042 00054 inline 00055 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end) 00056 { 00057 bm::id_t count = 0; 00058 00059 #ifdef BM64_SSE4 00060 do 00061 { 00062 __m128i tmp0 = _mm_load_si128(block); 00063 count += _mm_popcnt_u64(_mm_extract_epi64(tmp0, 0)) + 00064 _mm_popcnt_u64(_mm_extract_epi64(tmp0, 1)); 00065 __m128i tmp1 = _mm_load_si128(block+1); 00066 count += _mm_popcnt_u64(_mm_extract_epi64(tmp1, 0)) + 00067 _mm_popcnt_u64(_mm_extract_epi64(tmp1, 1)); 00068 00069 block +=2; 00070 } while (block < block_end); 00071 00072 #else 00073 do 00074 { 00075 const unsigned* b = (unsigned*) block; 00076 count += _mm_popcnt_u32(b[0]) + 00077 _mm_popcnt_u32(b[1]) + 00078 _mm_popcnt_u32(b[2]) + 00079 _mm_popcnt_u32(b[3]); 00080 } while (++block < block_end); 00081 #endif 00082 return count; 00083 } 00084 00088 BMFORCEINLINE 00089 unsigned op_xor(unsigned a, unsigned b) 00090 { 00091 unsigned ret = (a ^ b); 00092 return ret; 00093 } 00094 00098 BMFORCEINLINE 00099 unsigned op_or(unsigned a, unsigned b) 00100 { 00101 return (a | b); 00102 } 00103 00107 BMFORCEINLINE 00108 unsigned op_and(unsigned a, unsigned b) 00109 { 00110 return (a & b); 00111 } 00112 00113 00114 template<class Func> 00115 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block, 00116 const __m128i* BMRESTRICT block_end, 00117 const __m128i* BMRESTRICT mask_block, 00118 Func sse2_func) 00119 { 00120 bm::id_t count = 0; 00121 #ifdef BM64_SSE4 00122 do 00123 { 00124 __m128i tmp0 = _mm_load_si128(block); 00125 __m128i tmp1 = _mm_load_si128(mask_block); 00126 __m128i b = sse2_func(tmp0, tmp1); 00127 00128 count += _mm_popcnt_u64(_mm_extract_epi64(b, 0)); 00129 count += _mm_popcnt_u64(_mm_extract_epi64(b, 1)); 00130 00131 ++block; ++mask_block; 00132 } while (block < block_end); 00133 #else 00134 do 00135 { 00136 __m128i tmp0 = _mm_load_si128(block); 00137 __m128i tmp1 = _mm_load_si128(mask_block); 00138 __m128i b = sse2_func(tmp0, tmp1); 00139 00140 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0)); 00141 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1)); 00142 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2)); 00143 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3)); 00144 00145 ++block; ++mask_block; 00146 } while (block < block_end); 00147 #endif 00148 00149 return count; 00150 } 00151 00152 /* 00153 template<class Func> 00154 bm::id_t sse4_bit_count_op2(const __m128i* BMRESTRICT block, 00155 const __m128i* BMRESTRICT block_end, 00156 const __m128i* BMRESTRICT mask_block, 00157 Func op_func) 00158 { 00159 bm::id_t count = 0; 00160 #ifdef BM64_SSE4 00161 do 00162 { 00163 unsigned *r1 = (unsigned*) block; 00164 unsigned *r2 = (unsigned*) mask_block; 00165 00166 count += _mm_popcnt_u32(op_func(r1[0], r2[0])); 00167 count += _mm_popcnt_u32(op_func(r1[1], r2[1])); 00168 count += _mm_popcnt_u32(op_func(r1[2], r2[2])); 00169 count += _mm_popcnt_u32(op_func(r1[3], r2[3])); 00170 00171 ++mask_block; 00172 00173 } while (++block < block_end); 00174 #else 00175 do 00176 { 00177 unsigned *r1 = (unsigned*) block; 00178 unsigned *r2 = (unsigned*) mask_block; 00179 00180 count += _mm_popcnt_u32(op_func(r1[0], r2[0])); 00181 count += _mm_popcnt_u32(op_func(r1[1], r2[1])); 00182 count += _mm_popcnt_u32(op_func(r1[2], r2[2])); 00183 count += _mm_popcnt_u32(op_func(r1[3], r2[3])); 00184 00185 ++mask_block; 00186 00187 } while (++block < block_end); 00188 #endif 00189 return count; 00190 00191 } 00192 */ 00193 00194 00195 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\ 00196 sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask) 00197 00198 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\ 00199 sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask) 00200 00201 #define VECT_BITCOUNT(first, last) \ 00202 sse4_bit_count((__m128i*) (first), (__m128i*) (last)) 00203 00204 #define VECT_BITCOUNT_AND(first, last, mask) \ 00205 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) 00206 00207 #define VECT_BITCOUNT_OR(first, last, mask) \ 00208 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) 00209 00210 #define VECT_BITCOUNT_XOR(first, last, mask) \ 00211 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) 00212 00213 #define VECT_BITCOUNT_SUB(first, last, mask) \ 00214 sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) 00215 00216 #define VECT_INVERT_ARR(first, last) \ 00217 sse2_invert_arr(first, last); 00218 00219 #define VECT_AND_ARR(dst, src, src_end) \ 00220 sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end)) 00221 00222 #define VECT_OR_ARR(dst, src, src_end) \ 00223 sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end)) 00224 00225 #define VECT_SUB_ARR(dst, src, src_end) \ 00226 sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end)) 00227 00228 #define VECT_XOR_ARR(dst, src, src_end) \ 00229 sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end)) 00230 00231 #define VECT_COPY_BLOCK(dst, src, src_end) \ 00232 sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end)) 00233 00234 #define VECT_SET_BLOCK(dst, dst_end, value) \ 00235 sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value)) 00236 00237 00238 00239 00240 00247 inline 00248 bm::id_t sse4_bit_block_calc_count_change(const __m128i* BMRESTRICT block, 00249 const __m128i* BMRESTRICT block_end, 00250 unsigned* BMRESTRICT bit_count) 00251 { 00252 // __m128i mask1 = _mm_set_epi32(0x1, 0x1, 0x1, 0x1); 00253 register int count = (block_end - block)*4; 00254 00255 register bm::word_t w0, w_prev; 00256 const int w_shift = sizeof(w0) * 8 - 1; 00257 bool first_word = true; 00258 *bit_count = 0; 00259 00260 // first word 00261 { 00262 bm::word_t w; 00263 const bm::word_t* blk = (const bm::word_t*) block; 00264 w = w0 = blk[0]; 00265 *bit_count += _mm_popcnt_u32(w); 00266 w ^= (w >> 1); 00267 count += _mm_popcnt_u32(w); 00268 count -= (w_prev = (w0 >> w_shift)); 00269 } 00270 00271 do 00272 { 00273 __m128i b = _mm_load_si128(block); 00274 __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1)); // tmp2=(b >> 1) ^ b; 00275 __m128i tmp3 = _mm_srli_epi32(b, w_shift); // tmp3 = w0 >> w_shift 00276 // __m128i tmp4 = _mm_and_si128(b, mask1); // tmp4 = w0 & 1 00277 00278 // --------------------------------------------------------------------- 00279 { 00280 if (first_word) 00281 { 00282 first_word = false; 00283 } 00284 else 00285 { 00286 w0 = _mm_extract_epi32(b, 0); 00287 if (w0) 00288 { 00289 *bit_count += _mm_popcnt_u32(w0); 00290 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0)); 00291 count -= !(w_prev ^ (w0 & 1)); 00292 count -= w_prev = _mm_extract_epi32(tmp3, 0); 00293 } 00294 else 00295 { 00296 count -= !w_prev; w_prev ^= w_prev; 00297 } 00298 } 00299 w0 = _mm_extract_epi32(b, 1); 00300 if (w0) 00301 { 00302 *bit_count += _mm_popcnt_u32(w0); 00303 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1)); 00304 count -= !(w_prev ^ (w0 & 1)); 00305 count -= w_prev = _mm_extract_epi32(tmp3, 1); 00306 } 00307 else 00308 { 00309 count -= !w_prev; w_prev ^= w_prev; 00310 } 00311 w0 = _mm_extract_epi32(b, 2); 00312 if (w0) 00313 { 00314 *bit_count += _mm_popcnt_u32(w0); 00315 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2)); 00316 count -= !(w_prev ^ (w0 & 1)); 00317 count -= w_prev = _mm_extract_epi32(tmp3, 2); 00318 } 00319 else 00320 { 00321 count -= !w_prev; w_prev ^= w_prev; 00322 } 00323 w0 = _mm_extract_epi32(b, 3); 00324 if (w0) 00325 { 00326 *bit_count += _mm_popcnt_u32(w0); 00327 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3)); 00328 count -= !(w_prev ^ (w0 & 1)); 00329 count -= w_prev = _mm_extract_epi32(tmp3, 3); 00330 } 00331 else 00332 { 00333 count -= !w_prev; w_prev ^= w_prev; 00334 } 00335 } 00336 } while (++block < block_end); 00337 00338 return count; 00339 } 00340 00341 00342 00343 } // namespace 00344 00345 00346 00347 00348 #endif