dlvhex  2.5.0
vs10/bm/bmrandom.h
Go to the documentation of this file.
00001 #ifndef BMRANDOM__H__INCLUDED__
00002 #define BMRANDOM__H__INCLUDED__
00003 /*
00004 Copyright(c) 2009 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
00005 
00006 Permission is hereby granted, free of charge, to any person 
00007 obtaining a copy of this software and associated documentation 
00008 files (the "Software"), to deal in the Software without restriction, 
00009 including without limitation the rights to use, copy, modify, merge, 
00010 publish, distribute, sublicense, and/or sell copies of the Software, 
00011 and to permit persons to whom the Software is furnished to do so, 
00012 subject to the following conditions:
00013 
00014 The above copyright notice and this permission notice shall be included 
00015 in all copies or substantial portions of the Software.
00016 
00017 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
00018 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
00019 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
00020 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 
00021 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
00022 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 
00023 OTHER DEALINGS IN THE SOFTWARE.
00024 
00025 For more information please visit:  http://bmagic.sourceforge.net
00026 
00027 */
00028 
00029 #include "bm.h"
00030 #include "bmfunc.h"
00031 #include "bmdef.h"
00032 
00033 #include <stdlib.h>
00034 #include <algorithm>
00035 
00036 
00037 namespace bm
00038 {
00039 
00052 template<class BV>
00053 class random_subset
00054 {
00055 public:
00056     random_subset();
00057     ~random_subset();
00058 
00065     void sample(BV& bv_out, const BV& bv_in, unsigned count);
00066     
00067 
00068 private:
00069     typedef 
00070         typename BV::blocks_manager_type  blocks_manager_type;
00071 
00072 private:
00073     void get_subset(BV& bv_out, 
00074                     const BV& bv_in, 
00075                     unsigned  bv_in_count,
00076                     unsigned count);
00077 
00078     unsigned find_max_block();
00079     
00080     void get_random_subset(bm::word_t*       blk_out, 
00081                            const bm::word_t* blk_src,
00082                            unsigned          count);
00083     static 
00084     unsigned process_word(bm::word_t*       blk_out, 
00085                           const bm::word_t* blk_src,
00086                           unsigned          i,
00087                           unsigned          count);
00088 
00089     static
00090     void get_random_array(bm::word_t*       blk_out, 
00091                           bm::gap_word_t*   bit_list,
00092                           unsigned          bit_list_size,
00093                           unsigned          count);
00094 
00095 
00096 private:
00097     random_subset(const random_subset&);
00098     random_subset& operator=(const random_subset&);
00099 private:
00100     unsigned*         block_counts_; 
00101     unsigned short*   block_bits_take_;
00102     unsigned          blocks_;
00103     bm::gap_word_t    bit_list_[bm::gap_max_bits];
00104     unsigned*         block_candidates_;
00105     unsigned          candidates_count_;
00106     bm::word_t*       sub_block_;
00107 };
00108 
00109 
00111 
00112 
00113 
00114 template<class BV>
00115 random_subset<BV>::random_subset()
00116 : block_counts_(new unsigned[bm::set_total_blocks]),
00117   block_bits_take_(new bm::gap_word_t[bm::set_total_blocks]),
00118   block_candidates_(new unsigned[bm::set_total_blocks]),
00119   candidates_count_(0),
00120   sub_block_(new bm::word_t[bm::set_block_size])
00121 {
00122 }
00123 
00124 template<class BV>
00125 random_subset<BV>::~random_subset()
00126 {
00127     delete [] block_counts_;
00128     delete [] block_bits_take_;
00129     delete [] block_candidates_;
00130     delete [] sub_block_;
00131 }
00132 
00133 template<class BV>
00134 void random_subset<BV>::sample(BV&       bv_out, 
00135                                const BV& bv_in, 
00136                                unsigned  count)
00137 {
00138     if (count == 0)
00139     {
00140         bv_out.clear(true);
00141         return;
00142     }
00143 
00144     unsigned bcnt = bv_in.count();
00145     if (count >= bcnt)
00146     {
00147         bv_out = bv_in;
00148         return;
00149     }
00150     if (count > bcnt/2) 
00151     {
00152         // build the complement vector and subtract it
00153         BV tmp_bv;
00154         unsigned delta_count = bcnt - count;
00155 
00156         get_subset(tmp_bv, bv_in, bcnt, delta_count);
00157         bv_out = bv_in;
00158         bv_out -= tmp_bv;
00159         bv_out.forget_count();
00160         return;
00161     }
00162 
00163     get_subset(bv_out, bv_in, bcnt, count);
00164     bv_out.forget_count();
00165 }
00166 
00167 template<class BV>
00168 void random_subset<BV>::get_subset(BV&       bv_out, 
00169                                    const BV& bv_in, 
00170                                    unsigned  bcnt,
00171                                    unsigned  count)
00172 {
00173     bv_out.clear(true);
00174     bv_out.resize(bv_in.size());
00175 
00176     const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
00177     blocks_manager_type& bman_out = bv_out.get_blocks_manager();
00178 
00179     bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
00180     candidates_count_ = 0;
00181 
00182 
00183     // compute bit-counts in all blocks
00184     //
00185     unsigned block_count = blocks_ = bv_in.count_blocks(block_counts_);
00186     for (unsigned i = 0; i <= block_count; ++i)
00187     {
00188         if (block_counts_[i])
00189         {
00190             float block_percent = 
00191                 ((float)(block_counts_[i]+1)) / (float)bcnt;
00192             float bits_to_take = ((float)count) * block_percent; 
00193             bits_to_take += 0.99f;
00194 
00195             unsigned t = (unsigned)bits_to_take;
00196             block_bits_take_[i] = (bm::gap_word_t)t;
00197             
00198             if (block_bits_take_[i] == 0)
00199             {
00200                 block_bits_take_[i] = 1;
00201             }
00202             else
00203             if (block_bits_take_[i] > block_counts_[i])
00204                 block_bits_take_[i] = (gap_word_t)block_counts_[i];
00205 
00206             BM_ASSERT(bman_in.get_block(i));
00207         }
00208         else
00209         {
00210             block_bits_take_[i] = 0;
00211         }
00212     } // for i
00213 
00214     for (unsigned take_count = 0; count; count -= take_count) 
00215     {
00216         unsigned i = find_max_block();
00217         take_count = block_bits_take_[i];
00218         if (take_count > count)
00219             take_count = count;
00220         if (take_count == 0)
00221             continue;
00222         const bm::word_t* blk_src = bman_in.get_block(i);
00223         BM_ASSERT(blk_src);
00224 
00225         // allocate target block
00226         bm::word_t* blk_out = bman_out.get_block(i);
00227         if (blk_out != 0)
00228         {
00229             blk_out = bman_out.deoptimize_block(i);
00230         } 
00231         else
00232         {            
00233             blk_out = bman_out.get_allocator().alloc_bit_block();
00234             bman_out.set_block(i, blk_out);
00235         }
00236         if (take_count == block_counts_[i])
00237         {
00238             // copy the whole src block
00239             if (BM_IS_GAP(blk_src))
00240             {
00241                 gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
00242             }
00243             else
00244             {
00245                 bm::bit_block_copy(blk_out, blk_src);                
00246             }
00247             block_bits_take_[i] = 0; // exclude block from any farther picking
00248             continue;
00249         }
00250         bit_block_set(blk_out, 0);
00251 
00252         if (block_counts_[i] < 4096) // use array shuffle
00253         {
00254             unsigned arr_len;
00255             // convert source block to bit-block
00256             if (BM_IS_GAP(blk_src))
00257             {
00258                 arr_len = gap_convert_to_arr(bit_list_,
00259                                              BMGAP_PTR(blk_src),
00260                                              bm::gap_max_bits);
00261             }
00262             else // bit-block
00263             {
00264                 arr_len = bit_convert_to_arr(bit_list_, 
00265                                              blk_src, 
00266                                              bm::gap_max_bits, 
00267                                              bm::gap_max_bits,
00268                                              0);
00269             }
00270             BM_ASSERT(arr_len);
00271             get_random_array(blk_out, bit_list_, arr_len, take_count);
00272         }
00273         else // dense block
00274         {
00275             // convert source block to bit-block
00276             if (BM_IS_GAP(blk_src))
00277             {
00278                 gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
00279                 blk_src = tmp_block;
00280             }
00281 
00282             // pick random bits source block to target
00283             get_random_subset(blk_out, blk_src, take_count);
00284         }
00285         
00286         block_bits_take_[i] = 0; // exclude block from any farther picking
00287     }
00288 }
00289 
00290 template<class BV>
00291 void random_subset<BV>::get_random_subset(bm::word_t*       blk_out, 
00292                                           const bm::word_t* blk_src,
00293                                           unsigned          take_count)
00294 {
00295     for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
00296     {
00297         // pick random scan start and scan direction
00298         //
00299         unsigned i = rand() % bm::set_block_size;
00300         unsigned new_count;
00301 
00302         for (; i < bm::set_block_size && take_count; ++i)
00303         {
00304             if (blk_src[i] && (blk_out[i] != blk_src[i]))
00305             {
00306                 take_count -= new_count = 
00307                     process_word(blk_out, blk_src, i, take_count);
00308             }
00309         } // for i
00310 
00311     } // for
00312     // if masked scan did not produce enough results..
00313     //
00314     if (take_count)
00315     {
00316         // Find all vacant bits: do logical (src SUB out)
00317         for (unsigned i = 0; i < bm::set_block_size; ++i)
00318         {
00319             sub_block_[i] = blk_src[i] & ~blk_out[i];
00320         }
00321         // now transform vacant bits to array, then pick random elements
00322         //
00323         unsigned arr_len = bit_convert_to_arr(bit_list_, 
00324                                               sub_block_, 
00325                                               bm::gap_max_bits, 
00326                                               bm::gap_max_bits,
00327                                               0);
00328         BM_ASSERT(arr_len);
00329         get_random_array(blk_out, bit_list_, arr_len, take_count);        
00330     }
00331 }
00332 
00333 template<class BV>
00334 unsigned random_subset<BV>::process_word(bm::word_t*       blk_out, 
00335                                          const bm::word_t* blk_src,
00336                                          unsigned          i,
00337                                          unsigned          count)
00338 {
00339     unsigned new_bits, mask;
00340 
00341     do 
00342     {    
00343         mask = rand();
00344         mask ^= mask << 16;
00345     } while (mask == 0);
00346 
00347     unsigned src_rand = blk_src[i] & mask;    
00348     new_bits = src_rand & ~blk_out[i];
00349 
00350     if (new_bits)
00351     {
00352         unsigned new_count = bm::word_bitcount(new_bits);
00353 
00354         // check if we accidentally picked more bits than needed
00355         if (new_count > count)
00356         {
00357             BM_ASSERT(count);
00358 
00359             unsigned char blist[64];
00360             unsigned arr_size = bm::bit_list_4(new_bits, blist);
00361             BM_ASSERT(arr_size == new_count);
00362             std::random_shuffle(blist, blist + arr_size);
00363             unsigned value = 0;
00364             for (unsigned j = 0; j < count; ++j)
00365             {
00366                 value |= (1 << blist[j]);
00367             }
00368             new_bits = value;
00369             new_count = count;
00370 
00371             BM_ASSERT(bm::word_bitcount(new_bits) == count);
00372             BM_ASSERT((new_bits & ~blk_src[i]) == 0);
00373         }
00374 
00375         blk_out[i] |= new_bits;
00376         return new_count;
00377     }
00378 
00379     return 0;    
00380 }
00381 
00382 
00383 template<class BV>
00384 void random_subset<BV>::get_random_array(bm::word_t*       blk_out, 
00385                                          bm::gap_word_t*   bit_list,
00386                                          unsigned          bit_list_size,
00387                                          unsigned          count)
00388 {
00389     std::random_shuffle(bit_list, bit_list + bit_list_size);
00390     for (unsigned i = 0; i < count; ++i)
00391     {
00392         bm::set_bit(blk_out, bit_list[i]);
00393     }
00394 }
00395 
00396 template<class BV>
00397 unsigned random_subset<BV>::find_max_block() 
00398 {
00399     if (candidates_count_)
00400     {
00401         return block_candidates_[--candidates_count_];
00402     }
00403 
00404     unsigned candidate = 0;
00405     unsigned max_i = 0;
00406     for (unsigned i = 0; i <= blocks_; ++i)
00407     {
00408         if (block_bits_take_[i] == 0) continue;
00409         if (block_bits_take_[i] == candidate)
00410         {
00411             block_candidates_[candidates_count_++] = i;
00412         }
00413         else
00414         {
00415             unsigned diff = abs((int)block_bits_take_[i] - (int)candidate);
00416             double d = (double)diff / (double)candidate;
00417 
00418             if (d < 0.20f) // delta is statistically insignificant
00419             {
00420                 block_candidates_[candidates_count_++] = i;
00421             }
00422             else
00423             if (block_bits_take_[i] > candidate)
00424             {
00425                  candidate = block_bits_take_[i];
00426                  max_i = i;
00427                  candidates_count_ = 0;
00428                  block_candidates_[candidates_count_++] = i;
00429             }
00430         }
00431     }
00432 
00433     if (candidates_count_ > 1)
00434     {
00435         std::random_shuffle(block_candidates_, block_candidates_ + candidates_count_);
00436         return find_max_block();
00437     }
00438     return max_i;
00439 }
00440 
00441 
00442 } // namespace
00443 
00444 #include "bmundef.h"
00445 
00446 #endif