dlvhex  2.5.0
vs12/bm/bmsse_util.h
Go to the documentation of this file.
00001 #ifndef BMSSE_UTIL__H__INCLUDED__
00002 #define BMSSE_UTIL__H__INCLUDED__
00003 /*
00004 Copyright(c) 2002-2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
00005 
00006 Permission is hereby granted, free of charge, to any person 
00007 obtaining a copy of this software and associated documentation 
00008 files (the "Software"), to deal in the Software without restriction, 
00009 including without limitation the rights to use, copy, modify, merge, 
00010 publish, distribute, sublicense, and/or sell copies of the Software, 
00011 and to permit persons to whom the Software is furnished to do so, 
00012 subject to the following conditions:
00013 
00014 The above copyright notice and this permission notice shall be included 
00015 in all copies or substantial portions of the Software.
00016 
00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
00023 OTHER DEALINGS IN THE SOFTWARE.
00024 
00025 For more information please visit:  http://bmagic.sourceforge.net
00026 
00027 */
00028 
00029 
00030 
00031 namespace bm
00032 {
00033 
/*!
    @brief RAII guard that executes EMMS (_mm_empty) on construction and
    again on destruction.

    _mm_empty() clears the MMX state / FP tag word; instantiating this
    guard on the stack brackets a scope with EMMS on entry and exit.
    Presumably intended for safely mixing MMX code with floating-point
    code — confirm against the callers that instantiate it.
*/
class sse_empty_guard
{
public:
    //! Issue EMMS when the guarded scope is entered.
    BMFORCEINLINE sse_empty_guard() 
    {
        _mm_empty();
    }

    //! Issue EMMS again when the guarded scope is left.
    BMFORCEINLINE ~sse_empty_guard() 
    {
        _mm_empty();
    }
};
00062 
00063 
00064 
00071 BMFORCEINLINE 
00072 void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst, 
00073                          const __m128i* BMRESTRICT src, 
00074                          const __m128i* BMRESTRICT src_end,
00075                          bm::word_t mask)
00076 {
00077      __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
00078      do
00079      {
00080         __m128i xmm1 = _mm_load_si128(src);
00081 
00082         xmm1 = _mm_xor_si128(xmm1, xmm2);
00083         _mm_store_si128(dst, xmm1);
00084         ++dst;
00085         ++src;
00086 
00087      } while (src < src_end);
00088 }
00089 
00090 
00097 BMFORCEINLINE 
00098 void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst, 
00099                             const __m128i* BMRESTRICT src, 
00100                             const __m128i* BMRESTRICT src_end,
00101                             bm::word_t mask)
00102 {
00103      __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
00104      do
00105      {
00106         //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
00107         //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);
00108 
00109         __m128i xmm1 = _mm_load_si128(src);
00110 
00111         xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2 
00112         _mm_store_si128(dst, xmm1);
00113         ++dst;
00114         ++src;
00115 
00116      } while (src < src_end);
00117 }
00118 
00125 BMFORCEINLINE 
00126 void sse2_and_arr(__m128i* BMRESTRICT dst, 
00127                   const __m128i* BMRESTRICT src, 
00128                   const __m128i* BMRESTRICT src_end)
00129 {
00130     __m128i xmm1, xmm2;
00131     do
00132     {
00133         _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
00134     
00135         xmm1 = _mm_load_si128(src++);
00136         xmm2 = _mm_load_si128(dst);
00137         xmm1 = _mm_and_si128(xmm1, xmm2);
00138         _mm_store_si128(dst++, xmm1);
00139         
00140         xmm1 = _mm_load_si128(src++);
00141         xmm2 = _mm_load_si128(dst);
00142         xmm1 = _mm_and_si128(xmm1, xmm2);
00143         _mm_store_si128(dst++, xmm1);
00144 
00145         xmm1 = _mm_load_si128(src++);
00146         xmm2 = _mm_load_si128(dst);
00147         xmm1 = _mm_and_si128(xmm1, xmm2);
00148         _mm_store_si128(dst++, xmm1);
00149 
00150         xmm1 = _mm_load_si128(src++);
00151         xmm2 = _mm_load_si128(dst);
00152         xmm1 = _mm_and_si128(xmm1, xmm2);
00153         _mm_store_si128(dst++, xmm1);
00154 
00155     } while (src < src_end);
00156 
00157 }
00158 
00159 
00166 BMFORCEINLINE 
00167 void sse2_or_arr(__m128i* BMRESTRICT dst, 
00168                  const __m128i* BMRESTRICT src, 
00169                  const __m128i* BMRESTRICT src_end)
00170 {
00171     __m128i xmm1, xmm2;
00172     do
00173     {
00174         _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
00175     
00176         xmm1 = _mm_load_si128(src++);
00177         xmm2 = _mm_load_si128(dst);
00178         xmm1 = _mm_or_si128(xmm1, xmm2);
00179         _mm_store_si128(dst++, xmm1);
00180         
00181         xmm1 = _mm_load_si128(src++);
00182         xmm2 = _mm_load_si128(dst);
00183         xmm1 = _mm_or_si128(xmm1, xmm2);
00184         _mm_store_si128(dst++, xmm1);
00185 
00186         xmm1 = _mm_load_si128(src++);
00187         xmm2 = _mm_load_si128(dst);
00188         xmm1 = _mm_or_si128(xmm1, xmm2);
00189         _mm_store_si128(dst++, xmm1);
00190 
00191         xmm1 = _mm_load_si128(src++);
00192         xmm2 = _mm_load_si128(dst);
00193         xmm1 = _mm_or_si128(xmm1, xmm2);
00194         _mm_store_si128(dst++, xmm1);
00195 
00196     } while (src < src_end);
00197 }
00198 
00199 
00206 BMFORCEINLINE 
00207 void sse2_xor_arr(__m128i* BMRESTRICT dst, 
00208                   const __m128i* BMRESTRICT src, 
00209                   const __m128i* BMRESTRICT src_end)
00210 {
00211     __m128i xmm1, xmm2;
00212     do
00213     {
00214         _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
00215     
00216         xmm1 = _mm_load_si128(src++);
00217         xmm2 = _mm_load_si128(dst);
00218         xmm1 = _mm_xor_si128(xmm1, xmm2);
00219         _mm_store_si128(dst++, xmm1);
00220         
00221         xmm1 = _mm_load_si128(src++);
00222         xmm2 = _mm_load_si128(dst);
00223         xmm1 = _mm_xor_si128(xmm1, xmm2);
00224         _mm_store_si128(dst++, xmm1);
00225 
00226         xmm1 = _mm_load_si128(src++);
00227         xmm2 = _mm_load_si128(dst);
00228         xmm1 = _mm_xor_si128(xmm1, xmm2);
00229         _mm_store_si128(dst++, xmm1);
00230 
00231         xmm1 = _mm_load_si128(src++);
00232         xmm2 = _mm_load_si128(dst);
00233         xmm1 = _mm_xor_si128(xmm1, xmm2);
00234         _mm_store_si128(dst++, xmm1);
00235 
00236     } while (src < src_end);
00237 }
00238 
00239 
00240 
00247 BMFORCEINLINE 
00248 void sse2_sub_arr(__m128i* BMRESTRICT dst, 
00249                  const __m128i* BMRESTRICT src, 
00250                  const __m128i* BMRESTRICT src_end)
00251 {
00252     __m128i xmm1, xmm2;
00253     do
00254     {
00255         _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
00256     
00257         xmm1 = _mm_load_si128(src++);
00258         xmm2 = _mm_load_si128(dst);
00259         xmm1 = _mm_andnot_si128(xmm1, xmm2);
00260         _mm_store_si128(dst++, xmm1);
00261         
00262         xmm1 = _mm_load_si128(src++);
00263         xmm2 = _mm_load_si128(dst);
00264         xmm1 = _mm_andnot_si128(xmm1, xmm2);
00265         _mm_store_si128(dst++, xmm1);
00266 
00267         xmm1 = _mm_load_si128(src++);
00268         xmm2 = _mm_load_si128(dst);
00269         xmm1 = _mm_andnot_si128(xmm1, xmm2);
00270         _mm_store_si128(dst++, xmm1);
00271 
00272         xmm1 = _mm_load_si128(src++);
00273         xmm2 = _mm_load_si128(dst);
00274         xmm1 = _mm_andnot_si128(xmm1, xmm2);
00275         _mm_store_si128(dst++, xmm1);
00276 
00277     } while (src < src_end);    
00278 }
00279 
00287 BMFORCEINLINE 
00288 void sse2_set_block(__m128i* BMRESTRICT dst, 
00289                     __m128i* BMRESTRICT dst_end, 
00290                     bm::word_t value)
00291 {
00292     __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
00293     do
00294     {            
00295         _mm_store_si128(dst, xmm0);
00296 /*        
00297         _mm_store_si128(dst+1, xmm0);
00298         _mm_store_si128(dst+2, xmm0);
00299         _mm_store_si128(dst+3, xmm0);
00300 
00301         _mm_store_si128(dst+4, xmm0);
00302         _mm_store_si128(dst+5, xmm0);
00303         _mm_store_si128(dst+6, xmm0);
00304         _mm_store_si128(dst+7, xmm0);
00305 
00306         dst += 8;
00307 */        
00308     } while (++dst < dst_end);
00309     
00310     _mm_sfence();
00311 }
00312 
00313 
00314 
00321 BMFORCEINLINE 
00322 void sse2_copy_block(__m128i* BMRESTRICT dst, 
00323                      const __m128i* BMRESTRICT src, 
00324                      const __m128i* BMRESTRICT src_end)
00325 {
00326     __m128i xmm0, xmm1, xmm2, xmm3;
00327     do
00328     {
00329         _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
00330     
00331         xmm0 = _mm_load_si128(src+0);
00332         xmm1 = _mm_load_si128(src+1);
00333         xmm2 = _mm_load_si128(src+2);
00334         xmm3 = _mm_load_si128(src+3);
00335         
00336         _mm_store_si128(dst+0, xmm0);
00337         _mm_store_si128(dst+1, xmm1);
00338         _mm_store_si128(dst+2, xmm2);
00339         _mm_store_si128(dst+3, xmm3);
00340         
00341         xmm0 = _mm_load_si128(src+4);
00342         xmm1 = _mm_load_si128(src+5);
00343         xmm2 = _mm_load_si128(src+6);
00344         xmm3 = _mm_load_si128(src+7);
00345         
00346         _mm_store_si128(dst+4, xmm0);
00347         _mm_store_si128(dst+5, xmm1);
00348         _mm_store_si128(dst+6, xmm2);
00349         _mm_store_si128(dst+7, xmm3);
00350         
00351         src += 8;
00352         dst += 8;
00353         
00354     } while (src < src_end);    
00355 }
00356 
00365 BMFORCEINLINE 
00366 void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
00367 {
00368     __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 
00369                                  0xFFFFFFFF, 0xFFFFFFFF);
00370     __m128i* wrd_ptr = (__m128i*)first;
00371 
00372     do 
00373     {
00374         _mm_prefetch((const char*)(wrd_ptr)+512,  _MM_HINT_NTA);
00375         
00376         __m128i xmm0 = _mm_load_si128(wrd_ptr);
00377         xmm0 = _mm_xor_si128(xmm0, xmm1);
00378         _mm_store_si128(wrd_ptr, xmm0);
00379         ++wrd_ptr;
00380     } while (wrd_ptr < (__m128i*)last);
00381 }
00382 
//! @brief 128-bit bitwise AND: returns a & b.
BMFORCEINLINE 
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}
00388 
//! @brief 128-bit bitwise OR: returns a | b.
BMFORCEINLINE 
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}
00394 
00395 
//! @brief 128-bit bitwise XOR: returns a ^ b.
BMFORCEINLINE 
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}
00401 
//! @brief 128-bit bitwise subtraction: returns a & ~b.
//! Note the argument swap: _mm_andnot_si128(b, a) inverts its FIRST
//! operand, computing (~b) & a.
BMFORCEINLINE 
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a);
}
00407 
00408 
00409 
00410 } // namespace
00411 
00412 
00413 
00414 #endif