0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
BloomFilterWithChecksum.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 3
9  * of the License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
29 #ifndef HYPERTABLE_BLOOM_FILTER_WITH_CHECKSUM_H
30 #define HYPERTABLE_BLOOM_FILTER_WITH_CHECKSUM_H
31 
32 #include <cmath>
33 #include <limits.h>
34 #include "Common/Checksum.h"
35 #include "Common/Filesystem.h"
36 #include "Common/Logger.h"
37 #include "Common/MurmurHash.h"
38 #include "Common/Serialization.h"
39 #include "Common/StaticBuffer.h"
40 #include "Common/StringExt.h"
41 #include "Common/System.h"
42 
43 namespace Hypertable {
44 
53 template <class HasherT = MurmurHash2>
55 public:
62  BasicBloomFilterWithChecksum(size_t items_estimate,
63  float false_positive_prob) {
64  m_items_actual = 0;
65  m_items_estimate = items_estimate;
66  m_false_positive_prob = false_positive_prob;
67  double num_hashes = -std::log(m_false_positive_prob) / std::log(2);
68  m_num_hash_functions = (size_t)num_hashes;
69  m_num_bits = (size_t)(m_items_estimate * num_hashes / std::log(2));
70  if (m_num_bits == 0) {
72  "Num elements=%lu false_positive_prob=%.3f",
73  (Lu)items_estimate, false_positive_prob);
74  }
75  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
76  m_bloom_base = new uint8_t[total_size()];
78  memset(m_bloom_base, 0, total_size());
79 
80  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions << " num bits="
81  << m_num_bits << " num bytes= " << m_num_bytes << " bits per element="
82  << double(m_num_bits) / items_estimate << HT_END;
83  }
84 
92  BasicBloomFilterWithChecksum(size_t items_estimate, float bits_per_item,
93  size_t num_hashes) {
94  m_items_actual = 0;
95  m_items_estimate = items_estimate;
97  m_num_hash_functions = num_hashes;
98  m_num_bits = (size_t)((double)items_estimate * (double)bits_per_item);
99  if (m_num_bits == 0) {
100  HT_THROWF(Error::EMPTY_BLOOMFILTER, "Num elements=%lu bits_per_item=%.3f",
101  (Lu)items_estimate, bits_per_item);
102  }
103  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
104  m_bloom_base = new uint8_t[total_size()];
106  memset(m_bloom_base, 0, total_size());
107 
108  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions << " num bits="
109  << m_num_bits << " num bytes=" << m_num_bytes << " bits per element="
110  << double(m_num_bits) / items_estimate << HT_END;
111  }
112 
121  BasicBloomFilterWithChecksum(size_t items_estimate, size_t items_actual,
122  int64_t length, size_t num_hashes) {
123  m_items_actual = items_actual;
124  m_items_estimate = items_estimate;
125  m_false_positive_prob = 0.0;
126  m_num_hash_functions = num_hashes;
127  m_num_bits = (size_t)length;
128  if (m_num_bits == 0) {
130  "Estimated items=%lu actual items=%lu length=%lld num hashes=%lu",
131  (Lu)items_estimate, (Lu)items_actual, (Lld)length, (Lu)num_hashes);
132  }
133  m_num_bytes = (m_num_bits / CHAR_BIT) + (m_num_bits % CHAR_BIT ? 1 : 0);
134  m_bloom_base = new uint8_t[total_size()];
136  memset(m_bloom_base, 0, total_size());
137 
138  HT_DEBUG_OUT << "num funcs=" << m_num_hash_functions << " num bits="
139  << m_num_bits << " num bytes=" << m_num_bytes << " bits per element="
140  << double(m_num_bits) / items_estimate << HT_END;
141  }
142 
145  delete[] m_bloom_base;
146  }
147 
148  /* XXX/review static functions to expose the bloom filter parameters, given
149  1) probablility and # keys
150  2) #keys and fix the total size (m), to calculate the optimal
151  probability - # hash functions to use
152  */
153 
162  void insert(const void *key, size_t len) {
163  uint32_t hash = len;
164 
165  for (size_t i = 0; i < m_num_hash_functions; ++i) {
166  hash = m_hasher(key, len, hash) % m_num_bits;
167  m_bloom_bits[hash / CHAR_BIT] |= (1 << (hash % CHAR_BIT));
168  }
169  m_items_actual++;
170  }
171 
176  void insert(const String &key) {
177  insert(key.c_str(), key.length());
178  }
179 
191  bool may_contain(const void *key, size_t len) const {
192  uint32_t hash = len;
193  uint8_t byte_mask;
194  uint8_t byte;
195 
196  for (size_t i = 0; i < m_num_hash_functions; ++i) {
197  hash = m_hasher(key, len, hash) % m_num_bits;
198  byte = m_bloom_bits[hash / CHAR_BIT];
199  byte_mask = (1 << (hash % CHAR_BIT));
200 
201  if ((byte & byte_mask) == 0) {
202  return false;
203  }
204  }
205  return true;
206  }
207 
213  bool may_contain(const String &key) const {
214  return may_contain(key.c_str(), key.length());
215  }
216 
221  void serialize(StaticBuffer &buf) {
222  buf.set(m_bloom_base, total_size(), false);
223  uint8_t *ptr = buf.base;
225  }
226 
232  uint8_t *base() { return m_bloom_base; }
233 
242  const uint8_t *ptr = m_bloom_base;
243  size_t remain = 4;
244  uint32_t stored_checksum = Serialization::decode_i32(&ptr, &remain);
245  uint32_t computed_checksum = fletcher32(m_bloom_bits, m_num_bytes);
246  if (stored_checksum != computed_checksum)
248  }
249 
254  size_t size() { return m_num_bytes; }
255 
261  size_t total_size() {
263  }
264 
270 
275  size_t get_length_bits() { return m_num_bits; }
276 
282 
287  size_t get_items_actual() { return m_items_actual; }
288 
289 private:
291  HasherT m_hasher;
292 
295 
298 
301 
304 
306  size_t m_num_bits;
307 
309  size_t m_num_bytes;
310 
312  uint8_t *m_bloom_bits;
313 
315  uint8_t *m_bloom_base;
316 };
317 
319 
320 } // namespace Hypertable
321 
322 #endif // HYPERTABLE_BLOOM_FILTER_WITH_CHECKSUM_H
size_t get_num_hashes()
Getter for the number of hash functions.
size_t get_items_actual()
Getter for the actual number of items.
A memory buffer of static size.
Definition: StaticBuffer.h:45
Retrieves system information (hardware, installation directory, etc)
#define HT_IO_ALIGNMENT_PADDING(size)
Definition: Filesystem.h:54
void set(uint8_t *data, uint32_t len, bool take_ownership=true)
Sets data pointer; the existing buffer is discarded and deleted.
Definition: StaticBuffer.h:175
static String filename
Definition: Config.cc:48
MurmurHash2 digest routine.
Abstract base class for a filesystem.
size_t get_items_estimate()
Getter for the estimated number of items.
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
bool may_contain(const String &key) const
Overloaded may_contain function for Strings.
A space-efficent probabilistic set for membership test, false postives are possible, but false negatives are not.
uint32_t decode_i32(const uint8_t **bufp, size_t *remainp)
Decode a 32-bit integer in little-endian order.
size_t m_num_hash_functions
Number of hash functions.
uint32_t fletcher32(const void *data8, size_t len8)
Compute fletcher32 checksum for arbitary data.
Definition: Checksum.cc:42
void serialize(StaticBuffer &buf)
Serializes the BloomFilter into a static memory buffer.
size_t total_size()
Getter for the total size (including checksum and metadata)
BasicBloomFilterWithChecksum(size_t items_estimate, size_t items_actual, int64_t length, size_t num_hashes)
Alternative constructor.
Logging routines and macros.
void encode_i32(uint8_t **bufp, uint32_t val)
Encode a 32-bit integer in little-endian order.
size_t m_items_estimate
Estimated number of items.
void insert(const void *key, size_t len)
Inserts a new blob into the hash.
#define HT_END
Definition: Logger.h:220
Functions to serialize/deserialize primitives to/from a memory buffer.
uint8_t * m_bloom_base
The serialized bloom filter data, including metadata and checksums.
A memory buffer of static size.
float m_false_positive_prob
Probability of returning a false positive.
BasicBloomFilterWithChecksum(size_t items_estimate, float false_positive_prob)
Constructor.
size_t get_length_bits()
Getter for the number of bits.
size_t size()
Getter for the bloom filter size.
Hypertable definitions
long long int Lld
Shortcut for printf formats.
Definition: String.h:53
Implementation of checksum routines.
~BasicBloomFilterWithChecksum()
Destructor; releases resources.
HasherT m_hasher
The hash function implementation.
uint8_t * m_bloom_bits
The actual bloom filter bit-array.
void validate(String &filename)
Validates the checksum of the BloomFilter.
#define HT_THROWF(_code_, _fmt_,...)
Definition: Error.h:490
size_t m_items_actual
Actual number of items.
BasicBloomFilterWithChecksum(size_t items_estimate, float bits_per_item, size_t num_hashes)
Alternative constructor.
void insert(const String &key)
Overloaded insert function for Strings.
long unsigned int Lu
Shortcut for printf formats.
Definition: String.h:47
BasicBloomFilterWithChecksum BloomFilterWithChecksum
String extensions and helpers: sets, maps, append operators etc.
#define HT_THROW(_code_, _msg_)
Definition: Error.h:478
bool may_contain(const void *key, size_t len) const
Checks if the data set "may" contain the key.
#define HT_DEBUG_OUT
Definition: Logger.h:261
size_t m_num_bytes
Number of bytes (approx.
uint8_t * base()
Getter for the serialized bloom filter data, including metadata and checksums.