dlvhex 2.5.0: bmsse_util.h source listing
#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__
/*
Copyright(c) 2002-2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit: http://bmagic.sourceforge.net

*/

#include <emmintrin.h> // SSE2 intrinsics used below

// BMFORCEINLINE, BMRESTRICT and bm::word_t are supplied by the
// library's configuration headers before this file is included.
// All block pointers passed to these functions must be 16-byte
// aligned (the code uses aligned loads/stores).

namespace bm
{

/*!
    @brief SSE empty guard: executes _mm_empty() (EMMS) on scope
    entry and exit to reset the MMX/FPU state around SIMD code.
*/
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard()
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse_empty_guard()
    {
        _mm_empty();
    }
};

/*!
    @brief XOR an array against a 32-bit mask: dst = src ^ mask.
*/
BMFORCEINLINE
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;

    } while (src < src_end);
}

/*!
    @brief Inverted AND of an array with a 32-bit mask:
    dst = (~src) & mask.
*/
BMFORCEINLINE
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
        //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);

        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;

    } while (src < src_end);
}

/*!
    @brief AND two arrays: dst &= src. The loop is unrolled to
    process four __m128i per iteration, so the array length must
    be a multiple of four vectors.
*/
BMFORCEINLINE
void sse2_and_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
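// sse2_or_arr, sse2_xor_arr and sse2_sub_arr below follow the same
// 4x-unrolled load-combine-store pattern as sse2_and_arr; only the
// combining intrinsic differs.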
/*!
    @brief OR two arrays: dst |= src. 4x unrolled.
*/
BMFORCEINLINE
void sse2_or_arr(__m128i* BMRESTRICT dst,
                 const __m128i* BMRESTRICT src,
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief XOR two arrays: dst ^= src. 4x unrolled.
*/
BMFORCEINLINE
void sse2_xor_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief SUB (AND NOT) two arrays: dst &= ~src. 4x unrolled.
*/
BMFORCEINLINE
void sse2_sub_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*!
    @brief Fill a block with a 32-bit value: every word in
    [dst, dst_end) is set to value; finishes with a store fence.
*/
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst,
                    __m128i* BMRESTRICT dst_end,
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32(value, value, value, value);
    do
    {
        _mm_store_si128(dst, xmm0);
/*
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
*/
    } while (++dst < dst_end);

    _mm_sfence();
}
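/*!
    @brief Copy a block: dst = src. Moves eight __m128i per
    iteration with a non-temporal prefetch ahead of the reads.
*/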
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;

    } while (src < src_end);
}

/*!
    @brief Invert bits in the word range [first, last): each word
    is XORed with all-ones.
*/
BMFORCEINLINE
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;

    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1);
        _mm_store_si128(wrd_ptr, xmm0);
        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}

BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b); // a & b
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b); // a | b
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b); // a ^ b
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b (set subtraction)
}

} // namespace bm

#endif // BMSSE_UTIL__H__INCLUDED__
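Below is a minimal usage sketch, not part of the original header. It stands in for the configuration macros that the library's own headers would normally define before this file is included (the BMFORCEINLINE and BMRESTRICT definitions and the bm::word_t typedef here are assumptions, as is the file name bmsse_util.h, inferred from the include guard), and ANDs two 16-byte-aligned blocks with bm::sse2_and_arr.

// demo.cpp -- hypothetical standalone driver (build: g++ -msse2 demo.cpp)
#include <emmintrin.h>
#include <cstdio>
#include <cstring>

// Stand-ins for the library's configuration macros (assumptions):
#define BMFORCEINLINE inline
#define BMRESTRICT
namespace bm { typedef unsigned word_t; }

#include "bmsse_util.h"

int main()
{
    __m128i a[8], b[8]; // __m128i arrays are 16-byte aligned by type
    for (int i = 0; i < 8; ++i)
    {
        _mm_store_si128(a + i, _mm_set1_epi32((int)0xFFFF0000));
        _mm_store_si128(b + i, _mm_set1_epi32(0x00FFFF00));
    }

    bm::sse2_and_arr(a, b, b + 8); // a[i] &= b[i]; 8 is a multiple of 4

    bm::word_t w;
    std::memcpy(&w, a, sizeof(w));
    std::printf("%08X\n", w); // prints 00FF0000
    return 0;
}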