0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
LoadDataSource.h
Go to the documentation of this file.
1 /* -*- c++ -*-
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; version 3 of the
9  * License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
22 #ifndef Hypertable_Lib_LoadDataSource_h
23 #define Hypertable_Lib_LoadDataSource_h
24 
28 
29 #include <Common/ByteString.h>
30 #include <Common/DynamicBuffer.h>
31 #include <Common/String.h>
32 
33 #include <boost/iostreams/device/file.hpp>
34 #include <boost/iostreams/filtering_stream.hpp>
35 
36 #include <fstream>
37 #include <iostream>
38 #include <memory>
39 #include <string>
40 #include <vector>
41 
42 namespace Hypertable {
43 
44  enum {
48  };
49 
51 
52  public:
53  LoadDataSource(const std::string &header_fname,
54  int row_uniquify_chars = 0,
55  int load_flags = 0);
56 
57  virtual ~LoadDataSource() { delete [] m_type_mask; return; }
58 
59  bool has_timestamps() {
60  return m_leading_timestamps || (m_timestamp_index != -1);
61  }
62 
63  virtual bool next(KeySpec *keyp, uint8_t **valuep, uint32_t *value_lenp,
64  bool *is_deletep, uint32_t *consumedp);
65 
66  virtual void init(const std::vector<String> &key_columns,
67  const std::string &timestamp_column,
68  char field_separator);
69 
70  int64_t get_current_lineno() { return m_cur_line; }
71  unsigned long get_source_size() const { return m_source_size; }
72 
73  protected:
74 
75  bool get_next_line(String &line) {
76  if (m_first_line_cached) {
77  line = m_first_line;
78  m_first_line_cached = false;
79  return true;
80  }
81  getline(m_fin, line);
82  if (m_fin.eof() && line.empty())
83  return false;
84  return true;
85  }
86 
87  virtual void parse_header(const String& header,
88  const std::vector<String> &key_columns,
89  const std::string &timestamp_column);
90  virtual void init_src()=0;
91  virtual uint64_t incr_consumed()=0;
92 
93  bool should_skip(int idx, const uint32_t *masks) {
94  uint32_t bm = masks[idx];
95  return bm && ((bm & TIMESTAMP) ||
97  && (bm & ROW_KEY)));
98  }
99 
101  public:
103  : index(0), width(0), left_justify(false), pad_character(' ') {}
104  void clear() { index=0; width=0; left_justify=false; pad_character=' '; }
105  int index;
106  int width;
109  };
110 
/// Bit flags that can be combined in a 32-bit mask word; should_skip()
/// tests mask entries against these values.
enum TypeMask {
  ROW_KEY   = 0x1,  ///< bit 0
  TIMESTAMP = 0x2   ///< bit 1
};
115 
116  std::string get_header();
117 
118  bool parse_date_format(const char *str, int64_t &timestamp);
119  bool parse_sec(const char *str, char **end_ptr, int64_t &ns);
120  bool add_row_component(int index);
121 
/// Pairs a family string with a qualifier string; instances are stored in
/// the m_column_info vector.
/// NOTE(review): presumably one entry per column named in the header line
/// -- confirm against parse_header().
122  struct ColumnInfo {
123  std::string family;
124  std::string qualifier;
125  };
126 
127  std::vector<ColumnInfo> m_column_info;
128  std::vector<const char *> m_values;
129  std::vector<KeyComponentInfo> m_key_comps;
130  uint32_t *m_type_mask;
131  size_t m_next_value;
132  boost::iostreams::filtering_istream m_fin;
133  int64_t m_cur_line;
139  int64_t m_timestamp;
140  size_t m_limit;
141  uint64_t m_offset;
142  bool m_zipped;
144  std::string m_header_fname;
147  std::string m_first_line;
148  unsigned long m_source_size;
151  };
152 
154  typedef std::shared_ptr<LoadDataSource> LoadDataSourcePtr;
155 
156 }
157 
158 #endif // Hypertable_Lib_LoadDataSource_h
bool should_skip(int idx, const uint32_t *masks)
virtual void parse_header(const String &header, const std::vector< String > &key_columns, const std::string &timestamp_column)
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
virtual void init_src()=0
Po::typed_value< String > * str(String *v=0)
Definition: Properties.h:166
LoadDataSource(const std::string &header_fname, int row_uniquify_chars=0, int load_flags=0)
virtual void init(const std::vector< String > &key_columns, const std::string &timestamp_column, char field_separator)
A dynamic, resizable and reference-counted memory buffer.
Definition: DynamicBuffer.h:42
std::vector< KeyComponentInfo > m_key_comps
unsigned long get_source_size() const
A dynamic, resizable memory buffer.
std::vector< ColumnInfo > m_column_info
boost::iostreams::filtering_istream m_fin
bool get_next_line(String &line)
std::vector< const char * > m_values
bool parse_date_format(const char *str, int64_t &timestamp)
Hypertable definitions
bool add_row_component(int index)
virtual uint64_t incr_consumed()=0
FixedRandomStringGenerator * m_rsgen
bool duplicate_key_columns(int flags)
Definition: LoadDataFlags.h:35
A String class based on std::string.
A serializable ByteString.
virtual bool next(KeySpec *keyp, uint8_t **valuep, uint32_t *value_lenp, bool *is_deletep, uint32_t *consumedp)
std::shared_ptr< LoadDataSource > LoadDataSourcePtr
Smart pointer to LoadDataSource.
bool parse_sec(const char *str, char **end_ptr, int64_t &ns)