0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
DataGenerator.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; version 3 of the
9  * License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
22 #include <Common/Compat.h>
23 
24 #include "DataGenerator.h"
25 #include "DataGeneratorRandom.h"
26 
27 #include <boost/algorithm/string.hpp>
28 #include <boost/algorithm/string/predicate.hpp>
29 
30 #include <cstdlib>
31 
32 extern "C" {
33 #include <limits.h>
34 #include <strings.h>
35 }
36 
37 using namespace Hypertable;
38 using namespace Hypertable::Config;
39 using namespace boost;
40 
42  : m_generator(generator), m_keys_only(generator->m_keys_only),
43  m_amount(0), m_count(0), m_last_data_size(0) {
44  RowComponent *row_comp;
45  Column *column;
46 
47  for (size_t i=0; i<generator->m_row_component_specs.size(); i++) {
48  if (generator->m_row_component_specs[i].type == INTEGER)
49  row_comp = new RowComponentInteger( generator->m_row_component_specs[i] );
50  else if (generator->m_row_component_specs[i].type == STRING)
51  row_comp = new RowComponentString( generator->m_row_component_specs[i] );
52  else if (generator->m_row_component_specs[i].type == TIMESTAMP)
53  row_comp = new RowComponentTimestamp( generator->m_row_component_specs[i] );
54  else
55  HT_ASSERT(!"Unrecognized row component type");
56  m_row_components.push_back(row_comp);
57  }
58 
59  if (m_keys_only) {
60  m_cell.value = (const ::uint8_t *)"";
61  m_cell.value_len = 0;
62  }
63 
64  for (size_t i=0; i<generator->m_column_specs.size(); i++) {
65  column = new ColumnString( generator->m_column_specs[i], m_keys_only );
66  m_columns.push_back(column);
67  }
68  m_next_column = m_columns.size() - 1;
69 
70  next();
71 }
72 
73 
75  size_t compi = m_row_components.size();
76 
77  HT_ASSERT(compi > 0);
78 
79  if (m_columns.size() <= 1)
80  m_next_column = 0;
81  else
83 
84  do {
85  compi--;
86  if (m_row_components[compi]->next())
87  break;
88  } while (compi > 0);
89 
90  m_row.clear();
91 
92  for (size_t i=0; i<m_row_components.size(); i++)
93  m_row_components[i]->render(m_row);
94 
95  m_cell.row_key = m_row.c_str();
96 
97  if (!m_columns.empty()) {
98  m_columns[m_next_column]->next();
99  m_cell.column_family = m_columns[m_next_column]->column_family.c_str();
100  m_cell.column_qualifier = m_columns[m_next_column]->qualifier().c_str();
101  if (!m_keys_only) {
102  m_cell.value = (const ::uint8_t *)m_columns[m_next_column]->value();
103  m_cell.value_len = m_columns[m_next_column]->value_len();
104  }
107  }
108  else {
109  m_last_data_size = m_row.length();
111  }
112  m_count++;
113 }
114 
116  next();
117  return *this;
118 }
119 
121  next();
122  for (int i=1; i<n; i++)
123  next();
124  return *this;
125 }
126 
127 
128 
/**
 * Builds a DataGenerator from a property set.
 *
 * Stores props and keys_only in m_props / m_keys_only, then:
 *  - sets m_max_bytes, m_max_keys and m_seed from "DataGenerator.MaxBytes",
 *    "DataGenerator.MaxKeys" and "DataGenerator.Seed";
 *  - fills m_row_component_specs from properties named
 *    "rowkey.component.<index><attribute>";
 *  - fills m_column_specs from properties whose names contain ".qualifier."
 *    or ".value.", using the text before the first '.' as the column family;
 *  - checks the column specs and applies the rowkey.order /
 *    rowkey.distribution / rowkey.seed fallbacks to the row components.
 *
 * @param props     Properties to read; also retained in m_props.
 * @param keys_only Retained in m_keys_only; when true an empty column list
 *                  is accepted.
 *
 * Malformed keys or values are reported with HT_THROW(Error::SYNTAX_ERROR, ...);
 * a missing column list, a missing component type, or an INTEGER format
 * without "lld" is reported with HT_FATAL / HT_FATALF.
 */
129 DataGenerator::DataGenerator(PropertiesPtr &props, bool keys_only) : m_props(props), m_keys_only(keys_only) {
130  int rowkey_order;
131  string rowkey_distribution;
132  unsigned int rowkey_seed;
133  string str;
  // Maps a column family name to its position in m_column_specs.
134  std::map<String, int> column_map;
135 
  // For each of the three settings below: when the unqualified has() reports
  // the key, the value comes from the unqualified get_i64()/get_i32();
  // otherwise it is read from m_props with a default (the int64 maximum for
  // the two limits, 1 for the seed).
136  if (has("DataGenerator.MaxBytes"))
137  m_max_bytes = get_i64("DataGenerator.MaxBytes");
138  else
139  m_max_bytes = m_props->get_i64("DataGenerator.MaxBytes", std::numeric_limits< ::int64_t >::max());
140 
141  if (has("DataGenerator.MaxKeys"))
142  m_max_keys = get_i64("DataGenerator.MaxKeys");
143  else
144  m_max_keys = m_props->get_i64("DataGenerator.MaxKeys", std::numeric_limits< ::int64_t >::max());
145 
146  if (has("DataGenerator.Seed"))
147  m_seed = get_i32("DataGenerator.Seed");
148  else
149  m_seed = m_props->get_i32("DataGenerator.Seed", 1);
150 
  // Generator-wide row key settings. They are applied at the end of the
  // constructor to every row component that left the matching field unset.
151  rowkey_order = parse_order( m_props->get_str("rowkey.order", "ascending") );
152  rowkey_distribution = m_props->get_str("rowkey.distribution", "uniform");
153  rowkey_seed = m_props->get_i32("rowkey.seed", 1);
154 
155  std::vector<String> names;
156  string name;
157  m_props->get_names(names);
158  long index = 0;
159  char *ptr, *tptr;
160 
  // Walk every property name and dispatch on its prefix / contents.
161  for (size_t i=0; i<names.size(); i++) {
162  if (starts_with(names[i], "rowkey.component.")) {
  // 17 is the length of "rowkey.component."; strtol() parses the component
  // index that follows and leaves ptr at the remaining attribute suffix.
163  index = strtol(names[i].c_str()+17, &ptr, 0);
  // Only indexes 0..100 are accepted.
164  if (index < 0 || index > 100)
165  HT_THROW(Error::SYNTAX_ERROR, format("Bad format for key %s", names[i].c_str()));
  // Grow the spec vector so that [index] is addressable.
166  if (m_row_component_specs.size() <= (size_t)index)
167  m_row_component_specs.resize(index+1);
  // The first group of attributes is matched by exact comparison of the suffix.
168  if (!strcmp(ptr, ".type")) {
169  str = m_props->get_str(names[i]);
170  if (!strcasecmp(str.c_str(), "integer"))
171  m_row_component_specs[index].type = INTEGER;
172  else if (!strcasecmp(str.c_str(), "string"))
173  m_row_component_specs[index].type = STRING;
174  else if (!strcasecmp(str.c_str(), "timestamp"))
175  m_row_component_specs[index].type = TIMESTAMP;
176  else
177  HT_THROW(Error::SYNTAX_ERROR, format("Invalid rowkey component type - %s", str.c_str()));
178  }
179  else if (!strcmp(ptr, ".format")) {
180  m_row_component_specs[index].format = m_props->get_str(names[i]);
  // Strip single and double quote characters from both ends of the format.
181  boost::trim_if(m_row_component_specs[index].format, boost::is_any_of("'\""));
182  }
183  else if (!strcmp(ptr, ".min")) {
184  m_row_component_specs[index].min = m_props->get_str(names[i]);
185  }
186  else if (!strcmp(ptr, ".max")) {
187  m_row_component_specs[index].max = m_props->get_str(names[i]);
188  }
189  else if (!strcmp(ptr, ".values")) {
190  str = m_props->get_str(names[i]);
191  m_row_component_specs[index].value_count = (uint64_t)strtoll(str.c_str(), 0, 0);
192  }
  // The remaining attributes are matched with ends_with() on the suffix
  // instead of an exact strcmp().
193  else if (ends_with(ptr, ".order")) {
194  m_row_component_specs[index].order = parse_order( m_props->get_str(names[i]) );
195  }
196  else if (ends_with(ptr, ".distribution")) {
197  m_row_component_specs[index].distribution = m_props->get_str(names[i]);
198  }
199  else if (ends_with(ptr, ".seed")) {
200  str = m_props->get_str(names[i]);
201  m_row_component_specs[index].seed = atoi(str.c_str());
202  }
203  else if (ends_with(ptr, ".length.min")) {
204  str = m_props->get_str(names[i]);
205  m_row_component_specs[index].length_min = atoi(str.c_str());
206  }
207  else if (ends_with(ptr, ".length.max")) {
208  str = m_props->get_str(names[i]);
209  m_row_component_specs[index].length_max = atoi(str.c_str());
210  }
211  else
212  HT_THROW(Error::SYNTAX_ERROR, format("Invalid key - %s", names[i].c_str()));
213 
214  }
  // Column settings: any key containing ".qualifier." or ".value.".
215  else if (strstr(names[i].c_str(), ".qualifier.") || strstr(names[i].c_str(), ".value.")) {
216  int columni;
  // name is a private copy of the key. Overwriting its first '.' with 0
  // splits it in place: name.c_str() now reads as the column family and
  // tptr points at the attribute path after the dot. The branch condition
  // guarantees the key contains a '.', so strchr() finds one.
217  name = String("") + names[i];
218  tptr = strchr((char *)name.c_str(), '.');
219  *tptr++ = 0;
220 
221  std::map<String, int>::iterator iter = column_map.find((String)name.c_str());
  // First time this family is seen: append a ColumnSpec and remember its index.
222  if (iter == column_map.end()) {
223  columni = column_map.size();
224  column_map[(String)name.c_str()] = columni;
225  m_column_specs.push_back(ColumnSpec());
  // NOTE(review): this assigns the whole std::string (embedded 0 and the
  // attribute path after it included), not just the family prefix -
  // confirm that consumers only read it through c_str().
226  m_column_specs[columni].column_family = name;
227  }
228  else
229  columni = (*iter).second;
230 
231  str = m_props->get_str(names[i]);
232 
  // Dispatch on the attribute path. There is no final else, so a path that
  // matches none of the cases below is ignored.
233  if (!strcmp(tptr, "qualifier.type")) {
234  if (!strcasecmp(str.c_str(), "STRING"))
235  m_column_specs[columni].qualifier.type = STRING;
236  else
237  HT_THROW(Error::SYNTAX_ERROR, format("Unsupported type (%s) for '%s'",
238  str.c_str(), names[i].c_str()));
239  }
240  else if (!strcmp(tptr, "qualifier.size")) {
241  m_column_specs[columni].qualifier.size = atoi(str.c_str());
242  }
243  else if (!strcmp(tptr, "qualifier.charset")) {
244  m_column_specs[columni].qualifier.charset = str;
245  boost::trim_if(m_column_specs[columni].qualifier.charset, boost::is_any_of("'\""));
246  }
  // "value.random" selects the column's order: false -> ASCENDING, true -> RANDOM.
247  else if (!strcmp(tptr, "value.random")) {
248  if (!strcasecmp(str.c_str(), "false"))
249  m_column_specs[columni].order = ASCENDING;
250  else if (!strcasecmp(str.c_str(), "true"))
251  m_column_specs[columni].order = RANDOM;
252  else
253  HT_THROW(Error::SYNTAX_ERROR, format("Bad value for 'value.random'"));
254  }
255  else if (!strcmp(tptr, "value.seed")) {
256  m_column_specs[columni].seed = atoi(str.c_str());
257  }
258  else if (!strcmp(tptr, "value.size")) {
259  m_column_specs[columni].size = atoi(str.c_str());
260  }
261  else if (!strcmp(tptr, "value.source")) {
262  m_column_specs[columni].source = str;
263  }
264  else if (!strcmp(tptr, "value.source.words")) {
265  if (!strcasecmp(str.c_str(), "true"))
266  m_column_specs[columni].word_stream = true;
267  else if (!strcasecmp(str.c_str(), "false"))
268  m_column_specs[columni].word_stream = false;
269  else
270  HT_THROW(Error::SYNTAX_ERROR, format("Bad value for 'value.source.words'"));
271  }
272  else if (!strcmp(tptr, "value.cooked-source")) {
273  m_column_specs[columni].cooked_source = str;
274  }
275  else if (!strcmp(tptr, "value.fixed")) {
276  if (!strcasecmp(str.c_str(), "false"))
277  m_column_specs[columni].fixed = false;
278  else if (!strcasecmp(str.c_str(), "true"))
279  m_column_specs[columni].fixed = true;
280  else
281  HT_THROW(Error::SYNTAX_ERROR, format("Bad value for 'value.fixed'"));
282  }
283 
284  }
285  }
286 
  // An empty column list is only acceptable when generating keys only.
287  if (!keys_only && m_column_specs.empty())
288  HT_FATAL("No columns specified");
289 
  // Per-column checks: a column whose qualifier type is not -1 needs a
  // qualifier size and a non-empty charset, and every column needs a value size.
  // NOTE(review): the line that opens each of the three statements below
  // (source lines 293, 297 and 302) is absent from this listing; only the
  // format(...) argument is visible. Presumably an error is raised with that
  // message - confirm against the original file.
290  for (size_t i=0; i<m_column_specs.size(); i++) {
291  if (m_column_specs[i].qualifier.type != -1) {
292  if (m_column_specs[i].qualifier.size == -1)
294  format("No qualifier size specified for column '%s'",
295  m_column_specs[i].column_family.c_str()));
296  if (m_column_specs[i].qualifier.charset == "")
298  format("No qualifier charset specified for column '%s'",
299  m_column_specs[i].column_family.c_str()));
300  }
301  if (m_column_specs[i].size == -1)
303  format("No value size specified for column '%s'",
304  m_column_specs[i].column_family.c_str()));
305  }
306 
  // Per-component fallbacks and checks. The values -1, "" and (unsigned)-1
  // are presumably the "unset" defaults of RowComponentSpec (declared
  // elsewhere); fields still holding them receive the rowkey.* settings
  // read above.
307  for (size_t i=0; i<m_row_component_specs.size(); i++) {
308  if (m_row_component_specs[i].order == -1)
309  m_row_component_specs[i].order = rowkey_order;
310  if (m_row_component_specs[i].distribution == "")
311  m_row_component_specs[i].distribution = rowkey_distribution;
312  if (m_row_component_specs[i].seed == (unsigned)-1)
313  m_row_component_specs[i].seed = rowkey_seed;
  // A component must have a type; an INTEGER component with a non-empty
  // format must mention "lld" somewhere in that format.
314  if (m_row_component_specs[i].type == -1)
315  HT_FATALF("Missing type for component %lu", (Lu)i);
316  else if (m_row_component_specs[i].type == INTEGER &&
317  m_row_component_specs[i].format != "") {
318  if (!strstr(m_row_component_specs[i].format.c_str(), "lld"))
319  HT_FATALF("Format sequence (%s) must contain 'lld'",
320  m_row_component_specs[i].format.c_str());
321  }
322  }
323 }
324 
325 
326 int DataGenerator::parse_order(const string &str) {
327  if (!strcasecmp(str.c_str(), "ascending"))
328  return ASCENDING;
329  else if (!strcasecmp(str.c_str(), "random"))
330  return RANDOM;
331  else
332  HT_THROW(Error::SYNTAX_ERROR, format("Unsupported order - %s", str.c_str()));
333 }
334 
Boost library.
Definition: Properties.cc:39
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
DataGeneratorIterator & operator++()
String format(const char *fmt,...)
Returns a String using printf-like format facilities. Vanilla snprintf is about 1.5x faster than this...
Definition: String.cc:37
std::vector< RowComponent * > m_row_components
std::vector< Column * > m_columns
Po::typed_value< String > * str(String *v=0)
Definition: Properties.h:166
int32_t random_int32(int32_t maximum)
Generate random 32-bit integer.
const char * column_qualifier
Definition: Cell.h:68
#define HT_FATAL(msg)
Definition: Logger.h:339
std::vector< RowComponentSpec > m_row_component_specs
std::vector< ColumnSpec > m_column_specs
bool has(const String &name)
Check existence of a configuration value.
Definition: Config.h:57
#define HT_ASSERT(_e_)
Definition: Logger.h:396
std::shared_ptr< Properties > PropertiesPtr
Definition: Properties.h:447
Compatibility Macros for C/C++.
const char * row_key
Definition: Cell.h:66
DataGeneratorIterator(DataGenerator *generator)
Hypertable definitions
#define HT_FATALF(msg,...)
Definition: Logger.h:343
int parse_order(const std::string &str)
Provides an STL-style iterator on DataGenerator objects.
const char * column_family
Definition: Cell.h:67
DataGenerator(PropertiesPtr &props, bool keys_only=false)
long unsigned int Lu
Shortcut for printf formats.
Definition: String.h:47
uint32_t value_len
Definition: Cell.h:72
#define HT_THROW(_code_, _msg_)
Definition: Error.h:478
const uint8_t * value
Definition: Cell.h:71