0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
CellStoreBlockIndexArray.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; version 3 of the
9  * License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
29 #ifndef Hypertable_RangeServer_CellStoreBlockIndexArray_h
30 #define Hypertable_RangeServer_CellStoreBlockIndexArray_h
31 
32 #include "CellList.h"
33 #include "CellListScannerBuffer.h"
34 
35 #include <Hypertable/Lib/Key.h>
38 
39 #include <Common/StaticBuffer.h>
40 
41 #include <cassert>
42 #include <iostream>
43 #include <map>
44 
45 namespace Hypertable {
46 
51  template <typename OffsetT>
53  public:
56 
58  OffsetT offset;
59  };
60 
61  template <typename OffsetT>
65  return x.key < y.key;
66  }
67  };
68 
72  template <typename OffsetT>
74  public:
75  typedef typename std::vector<CellStoreBlockIndexElementArray<OffsetT> >::iterator ArrayIteratorT;
76 
78  CellStoreBlockIndexIteratorArray(ArrayIteratorT iter) : m_iter(iter) { }
79  SerializedKey key() { return (*m_iter).key; }
80  int64_t value() { return (int64_t)(*m_iter).offset; }
84  ++(*this);
85  return copy;
86  }
88  return m_iter == other.m_iter;
89  }
91  return m_iter != other.m_iter;
92  }
93  protected:
94  ArrayIteratorT m_iter;
95  };
96 
100  template <typename OffsetT>
102  public:
106  typedef typename std::vector<ElementT> ArrayT;
107  typedef typename std::vector<ElementT>::iterator ArrayIteratorT;
108 
110 
111  void load(DynamicBuffer &fixed, DynamicBuffer &variable,int64_t end_of_data,
112  const String &start_row="", const String &end_row="") {
113  size_t total_entries = fixed.fill() / sizeof(OffsetT);
114  SerializedKey key;
115  OffsetT offset;
116  ElementT ee;
117  const uint8_t *key_ptr;
118  bool in_scope = (start_row == "") ? true : false;
119  bool check_for_end_row = end_row != "";
120  const uint8_t *variable_start = variable.base;
121  const uint8_t *variable_end = variable.ptr;
122 
123  assert(variable.own);
124 
125  m_end_of_last_block = end_of_data;
126 
127  fixed.ptr = fixed.base;
128  key_ptr = variable.base;
129 
130  for (size_t i=0; i<total_entries; ++i) {
131 
132  // variable portion
133  key.ptr = key_ptr;
134  key_ptr += key.length();
135 
136  // fixed portion (e.g. offset)
137  memcpy(&offset, fixed.ptr, sizeof(offset));
138  fixed.ptr += sizeof(offset);
139 
140  if (!in_scope) {
141  if (strcmp(key.row(), start_row.c_str()) <= 0)
142  continue;
143  variable_start = key.ptr;
144  in_scope = true;
145  }
146 
147  if (check_for_end_row && strcmp(key.row(), end_row.c_str()) > 0) {
148  ee.key = key;
149  ee.offset = offset;
150  m_array.push_back(ee);
151  if (i+1 < total_entries) {
152  key.ptr = key_ptr;
153  key_ptr += key.length();
154  variable_end = key_ptr;
155  memcpy(&m_end_of_last_block, fixed.ptr, sizeof(offset));
156  }
157  break;
158  }
159  ee.key = key;
160  ee.offset = offset;
161  m_array.push_back(ee);
162  }
163 
164  HT_ASSERT(key_ptr <= variable.ptr);
165 
166  if (!m_array.empty()) {
167  HT_ASSERT(variable_start < variable_end);
168 
169  // Copy portion of variable buffer used to m_keydata and fixup the
170  // array pointers to point into this new buffer
171  StaticBuffer keydata(variable_end-variable_start);
172  memcpy(keydata.base, variable_start, variable_end-variable_start);
173  for (auto &element : m_array) {
174  HT_ASSERT(element.key.ptr < variable_end);
175  uint64_t offset = element.key.ptr - variable_start;
176  element.key.ptr = keydata.base + offset;
177  }
178  m_keydata.free();
179  m_keydata = keydata;
180 
181  // compute space covered by this index scope
182  m_disk_used = m_end_of_last_block - (*m_array.begin()).offset;
183 
184  // determine split key
185  size_t mid_point = (m_array.size()==2) ? 0 : m_array.size()/2;
186  m_middle_key = m_array[mid_point].key;
187  }
188 
189  // Free variable buf here to maintain original semantics
190  variable.free();
191 
192  // The first time this method is called, it is called with the entire
193  // index, so save the size of the entire index for "fraction covered"
194  // calcuations
195  if (m_maximum_entries == (OffsetT)-1)
196  m_maximum_entries = (OffsetT)total_entries;
197  }
198 
199  void rescope(const String &start_row="", const String &end_row="") {
200  DynamicBuffer fixed(m_array.size() * sizeof(OffsetT));
201  DynamicBuffer variable;
202 
203  // Transfer ownership of m_keydata buffer to variable
204  m_keydata.own = false;
205  variable.base = variable.mark = m_keydata.base;
206  variable.size = m_keydata.size;
207  variable.ptr = variable.base + variable.size;
208 
209  // Populate fixed array
210  for (auto &element : m_array)
211  fixed.add_unchecked(&element.offset, sizeof(OffsetT));
212  m_array.clear();
213 
214  // Perform normal load
215  load(fixed, variable, m_end_of_last_block, start_row, end_row);
216  }
217 
218 
219  void display() {
220  SerializedKey last_key;
221  int64_t last_offset = 0;
222  int64_t block_size;
223  size_t i=0;
224  for (ArrayIteratorT iter = m_array.begin(); iter != m_array.end(); ++iter) {
225  if (last_key) {
226  block_size = (*iter).offset - last_offset;
227  std::cout << i << ": offset=" << last_offset << " size=" << block_size
228  << " row=" << last_key.row() << "\n";
229  i++;
230  }
231  last_offset = (*iter).offset;
232  last_key = (*iter).key;
233  }
234  if (last_key) {
235  block_size = m_end_of_last_block - last_offset;
236  std::cout << i << ": offset=" << last_offset << " size=" << block_size
237  << " row=" << last_key.row() << std::endl;
238  }
239  std::cout << "sizeof(OffsetT) = " << sizeof(OffsetT) << std::endl;
240  }
241 
248  int32_t keys_per_block) {
249  const char *row, *last_row = 0;
250  int64_t last_count = 0;
251  for (auto &e : m_array) {
252  row = e.key.row();
253  if (last_row == 0)
254  last_row = row;
255  if (strcmp(row, last_row) != 0) {
256  CstrToInt64MapT::iterator iter = split_row_data.find(last_row);
257  if (iter == split_row_data.end())
258  split_row_data[last_row] = last_count;
259  else
260  iter->second += last_count;
261  last_row = row;
262  last_count = 0;
263  }
264  last_count += keys_per_block;
265  }
266  // Deliberately skipping last entry in m_array because it is
267  // larger than end_row
268  }
269 
302  const String &filename,
303  int32_t keys_per_block,
304  float compression_ratio) {
305  Key key;
306  DynamicBuffer qualifier(filename.length() + 32);
307  DynamicBuffer serial_key_buf;
308  DynamicBuffer value_buf(32);
309  char buf[32];
310  char *offset_ptr;
311  const char *offset_format = (sizeof(OffsetT) == 4) ? "%08llX" : "%016llX";
312  double size;
313  OffsetT next_offset;
314 
315  qualifier.add_unchecked(filename.c_str(), filename.length());
316  qualifier.add_unchecked(":", 1);
317  offset_ptr = (char *)qualifier.ptr;
318 
319  for (size_t i=0; i<m_array.size(); i++) {
320 
321  if (i < (m_array.size()-1))
322  next_offset = m_array[i+1].offset;
323  else
324  next_offset = m_end_of_last_block;
325 
326  key.load(m_array[i].key);
327  sprintf(offset_ptr, offset_format, (long long)m_array[i].offset);
328 
329  // Size key
330  serial_key_buf.clear();
331  create_key_and_append(serial_key_buf, FLAG_INSERT, key.row,
333  (const char *)qualifier.base, key.timestamp,
334  key.revision);
335  // Size value
336  value_buf.clear();
337  size = (double)(next_offset - m_array[i].offset) / (double)compression_ratio;
338  sprintf(buf, "%lu", (unsigned long)size);
339  Serialization::encode_vi32(&value_buf.ptr, strlen(buf));
340  strcpy((char *)value_buf.ptr, buf);
341 
342  scanner->add( SerializedKey(serial_key_buf.base), ByteString(value_buf.base) );
343 
344  // CompressedSize key
345  serial_key_buf.clear();
346  create_key_and_append(serial_key_buf, FLAG_INSERT, key.row,
348  (const char *)qualifier.base, key.timestamp,
349  key.revision);
350  // CompressedSize value
351  value_buf.clear();
352  sprintf(buf, "%lu", (unsigned long)(next_offset - m_array[i].offset));
353  Serialization::encode_vi32(&value_buf.ptr, strlen(buf));
354  strcpy((char *)value_buf.ptr, buf);
355 
356  scanner->add( SerializedKey(serial_key_buf.base), ByteString(value_buf.base) );
357 
358  // KeyCount key
359  serial_key_buf.clear();
360  create_key_and_append(serial_key_buf, FLAG_INSERT, key.row,
362  (const char *)qualifier.base, key.timestamp,
363  key.revision);
364  // KeyCount value
365  value_buf.clear();
366  sprintf(buf, "%lu", (unsigned long)keys_per_block);
367  Serialization::encode_vi32(&value_buf.ptr, strlen(buf));
368  strcpy((char *)value_buf.ptr, buf);
369 
370  scanner->add( SerializedKey(serial_key_buf.base), ByteString(value_buf.base) );
371 
372  }
373  }
374 
375  size_t memory_used() {
376  return m_keydata.size + (m_array.size() * (sizeof(ElementT)));
377  }
378 
379  int64_t disk_used() { return m_disk_used; }
380 
381  double fraction_covered() {
382  HT_ASSERT(m_maximum_entries != (OffsetT)-1);
383  return (double)m_array.size() / (double)m_maximum_entries;
384  }
385 
387 
388  int64_t index_entries() { return m_array.size(); }
389 
390  iterator begin() {
391  return iterator(m_array.begin());
392  }
393 
394  iterator end() {
395  return iterator(m_array.end());
396  }
397 
398  iterator lower_bound(const SerializedKey& k) {
399  ElementT ee(k);
400  return iterator(std::lower_bound(m_array.begin(), m_array.end(), ee, LtT()));
401  }
402 
403  iterator upper_bound(const SerializedKey& k) {
404  ElementT ee(k);
405  return iterator(std::upper_bound(m_array.begin(), m_array.end(), ee, LtT()));
406  }
407 
408  void clear() {
409  m_array.clear();
410  m_keydata.free();
411  m_middle_key.ptr = 0;
412  m_maximum_entries = (OffsetT)-1;
413  }
414 
415  private:
416  ArrayT m_array;
420  OffsetT m_disk_used;
422  };
423 
426 } // namespace Hypertable
427 
428 #endif // Hypertable_RangeServer_CellStoreBlockIndexArray_h
void free()
Frees resources.
A memory buffer of static size.
Definition: StaticBuffer.h:45
CellStoreBlockIndexIteratorArray operator++(int)
CellStoreBlockIndexIteratorArray & operator++()
Cell list scanner over a buffer of cells.
static String filename
Definition: Config.cc:48
Type declarations for PseudoTables class.
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
bool operator==(const CellStoreBlockIndexIteratorArray &other)
const char * row() const
Definition: SerializedKey.h:53
static const uint32_t FLAG_INSERT
Definition: KeySpec.h:47
CellStoreBlockIndexElementArray(const SerializedKey &key_)
.cellstore.index CompressedSize column family
Definition: PseudoTables.h:53
Hypertable::CellStoreBlockIndexElementArray< OffsetT > ElementT
Declarations for CellListScannerBuffer.
iterator upper_bound(const SerializedKey &k)
uint8_t * ptr
Pointer to the end of the used part of the buffer.
A dynamic, resizable and reference counted memory buffer.
Definition: DynamicBuffer.h:42
.cellstore.index KeyCount column family
Definition: PseudoTables.h:55
A class managing one or more serializable ByteStrings.
Definition: ByteString.h:47
#define HT_ASSERT(_e_)
Definition: Logger.h:396
std::map< const char *, int64_t, LtCstr, SplitRowDataAlloc > SplitRowDataMapT
Definition: CellList.h:66
bool operator()(const CellStoreBlockIndexElementArray< OffsetT > &x, const CellStoreBlockIndexElementArray< OffsetT > &y) const
std::vector< ElementT >::iterator ArrayIteratorT
uint32_t size
The size of the allocated memory buffer (base)
void add(const SerializedKey key, const ByteString value)
Adds a key/value pair to the buffer.
bool load(const SerializedKey &key)
Parses the opaque key and loads the components into the member variables.
Definition: Key.cc:158
A memory buffer of static size.
size_t length() const
Retrieves the length of the serialized string.
Definition: ByteString.h:62
void free()
Clears the data; if this object is owner of the data then the allocated buffer is delete[]d...
Definition: StaticBuffer.h:185
bool own
If true then the buffer (base) will be released when going out of scope; if false then the caller has...
const uint8_t * ptr
The pointer to the serialized data.
Definition: ByteString.h:121
Hypertable::CellStoreBlockIndexIteratorArray< OffsetT > iterator
Hypertable definitions
void encode_vi32(uint8_t **bufp, uint32_t val)
Encode a integer (up to 32-bit) in variable length encoding.
void copy(TableDumper &, CellsBuilder &)
Definition: TableDumper.cc:129
void create_key_and_append(DynamicBuffer &dst_buf, const Key &key, bool time_order_asc)
Definition: Key.cc:105
void load(DynamicBuffer &fixed, DynamicBuffer &variable, int64_t end_of_data, const String &start_row="", const String &end_row="")
void clear()
Clears the buffer.
std::vector< CellStoreBlockIndexElementArray< OffsetT > >::iterator ArrayIteratorT
Provides access to internal components of opaque key.
Definition: Key.h:40
uint8_t * base
Pointer to the allocated memory buffer.
size_t fill() const
Returns the size of the used portion.
Definition: DynamicBuffer.h:70
.cellstore.index Size column family
Definition: PseudoTables.h:51
void populate_pseudo_table_scanner(CellListScannerBuffer *scanner, const String &filename, int32_t keys_per_block, float compression_ratio)
Populates scanner with data for .cellstore.index pseudo table.
uint8_t * mark
A "bookmark", can be set by the caller.
void rescope(const String &start_row="", const String &end_row="")
Provides an STL-style iterator on CellStoreBlockIndex objects.
uint8_t * add_unchecked(const void *data, size_t len)
Adds additional data without boundary checks.
bool operator!=(const CellStoreBlockIndexIteratorArray &other)
void unique_row_count_estimate(CellList::SplitRowDataMapT &split_row_data, int32_t keys_per_block)
Accumulates unique row estimates from block index entries.
Hypertable::LtCellStoreBlockIndexElementArray< OffsetT > LtT
iterator lower_bound(const SerializedKey &k)