dlvhex
2.5.0
|
00001 #ifndef BMRANDOM__H__INCLUDED__ 00002 #define BMRANDOM__H__INCLUDED__ 00003 /* 00004 Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com) 00005 00006 Permission is hereby granted, free of charge, to any person 00007 obtaining a copy of this software and associated documentation 00008 files (the "Software"), to deal in the Software without restriction, 00009 including without limitation the rights to use, copy, modify, merge, 00010 publish, distribute, sublicense, and/or sell copies of the Software, 00011 and to permit persons to whom the Software is furnished to do so, 00012 subject to the following conditions: 00013 00014 The above copyright notice and this permission notice shall be included 00015 in all copies or substantial portions of the Software. 00016 00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 00023 OTHER DEALINGS IN THE SOFTWARE. 00024 00025 For more information please visit: http://bmagic.sourceforge.net 00026 00027 */ 00028 00029 #include "bm.h" 00030 #include "bmfunc.h" 00031 #include "bmdef.h" 00032 00033 #include <stdlib.h> 00034 #include <algorithm> 00035 00036 00037 namespace bm 00038 { 00039 00052 template<class BV> 00053 class random_subset 00054 { 00055 public: 00056 random_subset(); 00057 ~random_subset(); 00058 00065 void sample(BV& bv_out, const BV& bv_in, unsigned count); 00066 00067 00068 private: 00069 typedef 00070 typename BV::blocks_manager_type blocks_manager_type; 00071 00072 private: 00073 void get_subset(BV& bv_out, 00074 const BV& bv_in, 00075 unsigned bv_in_count, 00076 unsigned count); 00077 00078 unsigned find_max_block(); 00079 00080 void get_random_subset(bm::word_t* blk_out, 00081 const bm::word_t* blk_src, 00082 unsigned count); 00083 static 00084 unsigned process_word(bm::word_t* blk_out, 00085 const bm::word_t* blk_src, 00086 unsigned i, 00087 unsigned count); 00088 00089 static 00090 void get_random_array(bm::word_t* blk_out, 00091 bm::gap_word_t* bit_list, 00092 unsigned bit_list_size, 00093 unsigned count); 00094 00095 00096 private: 00097 random_subset(const random_subset&); 00098 random_subset& operator=(const random_subset&); 00099 private: 00100 unsigned* block_counts_; 00101 unsigned short* block_bits_take_; 00102 unsigned blocks_; 00103 bm::gap_word_t bit_list_[bm::gap_max_bits]; 00104 unsigned* block_candidates_; 00105 unsigned candidates_count_; 00106 bm::word_t* sub_block_; 00107 }; 00108 00109 00111 00112 00113 00114 template<class BV> 00115 random_subset<BV>::random_subset() 00116 : block_counts_(new unsigned[bm::set_total_blocks]), 00117 block_bits_take_(new bm::gap_word_t[bm::set_total_blocks]), 00118 block_candidates_(new unsigned[bm::set_total_blocks]), 00119 candidates_count_(0), 00120 sub_block_(new bm::word_t[bm::set_block_size]) 00121 { 00122 } 00123 00124 template<class BV> 00125 random_subset<BV>::~random_subset() 00126 { 00127 delete [] block_counts_; 00128 delete [] block_bits_take_; 00129 delete [] block_candidates_; 00130 delete [] sub_block_; 00131 } 00132 00133 template<class BV> 00134 void random_subset<BV>::sample(BV& bv_out, 00135 const BV& bv_in, 00136 unsigned count) 00137 { 00138 if (count == 0) 00139 { 00140 bv_out.clear(true); 00141 return; 00142 } 00143 00144 unsigned bcnt = bv_in.count(); 00145 if (count >= bcnt) 00146 { 00147 bv_out = bv_in; 00148 return; 00149 } 00150 if (count > bcnt/2) 00151 { 00152 // build the complement vector and subtract it 00153 BV tmp_bv; 00154 unsigned delta_count = bcnt - count; 00155 00156 get_subset(tmp_bv, bv_in, bcnt, delta_count); 00157 bv_out = bv_in; 00158 bv_out -= tmp_bv; 00159 bv_out.forget_count(); 00160 return; 00161 } 00162 00163 get_subset(bv_out, bv_in, bcnt, count); 00164 bv_out.forget_count(); 00165 } 00166 00167 template<class BV> 00168 void random_subset<BV>::get_subset(BV& bv_out, 00169 const BV& bv_in, 00170 unsigned bcnt, 00171 unsigned count) 00172 { 00173 bv_out.clear(true); 00174 bv_out.resize(bv_in.size()); 00175 00176 const blocks_manager_type& bman_in = bv_in.get_blocks_manager(); 00177 blocks_manager_type& bman_out = bv_out.get_blocks_manager(); 00178 00179 bm::word_t* tmp_block = bman_out.check_allocate_tempblock(); 00180 candidates_count_ = 0; 00181 00182 00183 // compute bit-counts in all blocks 00184 // 00185 unsigned block_count = blocks_ = bv_in.count_blocks(block_counts_); 00186 for (unsigned i = 0; i <= block_count; ++i) 00187 { 00188 if (block_counts_[i]) 00189 { 00190 float block_percent = 00191 ((float)(block_counts_[i]+1)) / (float)bcnt; 00192 float bits_to_take = ((float)count) * block_percent; 00193 bits_to_take += 0.99f; 00194 00195 unsigned t = (unsigned)bits_to_take; 00196 block_bits_take_[i] = (bm::gap_word_t)t; 00197 00198 if (block_bits_take_[i] == 0) 00199 { 00200 block_bits_take_[i] = 1; 00201 } 00202 else 00203 if (block_bits_take_[i] > block_counts_[i]) 00204 block_bits_take_[i] = (gap_word_t)block_counts_[i]; 00205 00206 BM_ASSERT(bman_in.get_block(i)); 00207 } 00208 else 00209 { 00210 block_bits_take_[i] = 0; 00211 } 00212 } // for i 00213 00214 for (unsigned take_count = 0; count; count -= take_count) 00215 { 00216 unsigned i = find_max_block(); 00217 take_count = block_bits_take_[i]; 00218 if (take_count > count) 00219 take_count = count; 00220 if (take_count == 0) 00221 continue; 00222 const bm::word_t* blk_src = bman_in.get_block(i); 00223 BM_ASSERT(blk_src); 00224 00225 // allocate target block 00226 bm::word_t* blk_out = bman_out.get_block(i); 00227 if (blk_out != 0) 00228 { 00229 blk_out = bman_out.deoptimize_block(i); 00230 } 00231 else 00232 { 00233 blk_out = bman_out.get_allocator().alloc_bit_block(); 00234 bman_out.set_block(i, blk_out); 00235 } 00236 if (take_count == block_counts_[i]) 00237 { 00238 // copy the whole src block 00239 if (BM_IS_GAP(blk_src)) 00240 { 00241 gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src)); 00242 } 00243 else 00244 { 00245 bm::bit_block_copy(blk_out, blk_src); 00246 } 00247 block_bits_take_[i] = 0; // exclude block from any farther picking 00248 continue; 00249 } 00250 bit_block_set(blk_out, 0); 00251 00252 if (block_counts_[i] < 4096) // use array shuffle 00253 { 00254 unsigned arr_len; 00255 // convert source block to bit-block 00256 if (BM_IS_GAP(blk_src)) 00257 { 00258 arr_len = gap_convert_to_arr(bit_list_, 00259 BMGAP_PTR(blk_src), 00260 bm::gap_max_bits); 00261 } 00262 else // bit-block 00263 { 00264 arr_len = bit_convert_to_arr(bit_list_, 00265 blk_src, 00266 bm::gap_max_bits, 00267 bm::gap_max_bits, 00268 0); 00269 } 00270 BM_ASSERT(arr_len); 00271 get_random_array(blk_out, bit_list_, arr_len, take_count); 00272 } 00273 else // dense block 00274 { 00275 // convert source block to bit-block 00276 if (BM_IS_GAP(blk_src)) 00277 { 00278 gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src)); 00279 blk_src = tmp_block; 00280 } 00281 00282 // pick random bits source block to target 00283 get_random_subset(blk_out, blk_src, take_count); 00284 } 00285 00286 block_bits_take_[i] = 0; // exclude block from any farther picking 00287 } 00288 } 00289 00290 template<class BV> 00291 void random_subset<BV>::get_random_subset(bm::word_t* blk_out, 00292 const bm::word_t* blk_src, 00293 unsigned take_count) 00294 { 00295 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds) 00296 { 00297 // pick random scan start and scan direction 00298 // 00299 unsigned i = rand() % bm::set_block_size; 00300 unsigned new_count; 00301 00302 for (; i < bm::set_block_size && take_count; ++i) 00303 { 00304 if (blk_src[i] && (blk_out[i] != blk_src[i])) 00305 { 00306 take_count -= new_count = 00307 process_word(blk_out, blk_src, i, take_count); 00308 } 00309 } // for i 00310 00311 } // for 00312 // if masked scan did not produce enough results.. 00313 // 00314 if (take_count) 00315 { 00316 // Find all vacant bits: do logical (src SUB out) 00317 for (unsigned i = 0; i < bm::set_block_size; ++i) 00318 { 00319 sub_block_[i] = blk_src[i] & ~blk_out[i]; 00320 } 00321 // now transform vacant bits to array, then pick random elements 00322 // 00323 unsigned arr_len = bit_convert_to_arr(bit_list_, 00324 sub_block_, 00325 bm::gap_max_bits, 00326 bm::gap_max_bits, 00327 0); 00328 BM_ASSERT(arr_len); 00329 get_random_array(blk_out, bit_list_, arr_len, take_count); 00330 } 00331 } 00332 00333 template<class BV> 00334 unsigned random_subset<BV>::process_word(bm::word_t* blk_out, 00335 const bm::word_t* blk_src, 00336 unsigned i, 00337 unsigned count) 00338 { 00339 unsigned new_bits, mask; 00340 00341 do 00342 { 00343 mask = rand(); 00344 mask ^= mask << 16; 00345 } while (mask == 0); 00346 00347 unsigned src_rand = blk_src[i] & mask; 00348 new_bits = src_rand & ~blk_out[i]; 00349 00350 if (new_bits) 00351 { 00352 unsigned new_count = bm::word_bitcount(new_bits); 00353 00354 // check if we accidentally picked more bits than needed 00355 if (new_count > count) 00356 { 00357 BM_ASSERT(count); 00358 00359 unsigned char blist[64]; 00360 unsigned arr_size = bm::bit_list_4(new_bits, blist); 00361 BM_ASSERT(arr_size == new_count); 00362 std::random_shuffle(blist, blist + arr_size); 00363 unsigned value = 0; 00364 for (unsigned j = 0; j < count; ++j) 00365 { 00366 value |= (1 << blist[j]); 00367 } 00368 new_bits = value; 00369 new_count = count; 00370 00371 BM_ASSERT(bm::word_bitcount(new_bits) == count); 00372 BM_ASSERT((new_bits & ~blk_src[i]) == 0); 00373 } 00374 00375 blk_out[i] |= new_bits; 00376 return new_count; 00377 } 00378 00379 return 0; 00380 } 00381 00382 00383 template<class BV> 00384 void random_subset<BV>::get_random_array(bm::word_t* blk_out, 00385 bm::gap_word_t* bit_list, 00386 unsigned bit_list_size, 00387 unsigned count) 00388 { 00389 std::random_shuffle(bit_list, bit_list + bit_list_size); 00390 for (unsigned i = 0; i < count; ++i) 00391 { 00392 bm::set_bit(blk_out, bit_list[i]); 00393 } 00394 } 00395 00396 template<class BV> 00397 unsigned random_subset<BV>::find_max_block() 00398 { 00399 if (candidates_count_) 00400 { 00401 return block_candidates_[--candidates_count_]; 00402 } 00403 00404 unsigned candidate = 0; 00405 unsigned max_i = 0; 00406 for (unsigned i = 0; i <= blocks_; ++i) 00407 { 00408 if (block_bits_take_[i] == 0) continue; 00409 if (block_bits_take_[i] == candidate) 00410 { 00411 block_candidates_[candidates_count_++] = i; 00412 } 00413 else 00414 { 00415 unsigned diff = abs((int)block_bits_take_[i] - (int)candidate); 00416 double d = (double)diff / (double)candidate; 00417 00418 if (d < 0.20f) // delta is statistically insignificant 00419 { 00420 block_candidates_[candidates_count_++] = i; 00421 } 00422 else 00423 if (block_bits_take_[i] > candidate) 00424 { 00425 candidate = block_bits_take_[i]; 00426 max_i = i; 00427 candidates_count_ = 0; 00428 block_candidates_[candidates_count_++] = i; 00429 } 00430 } 00431 } 00432 00433 if (candidates_count_ > 1) 00434 { 00435 std::random_shuffle(block_candidates_, block_candidates_ + candidates_count_); 00436 return find_max_block(); 00437 } 00438 return max_i; 00439 } 00440 00441 00442 } // namespace 00443 00444 #include "bmundef.h" 00445 00446 #endif