0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
apache_log_load.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 3
9  * of the License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
22 #include <Common/Compat.h>
23 
25 #include <Hypertable/Lib/Client.h>
26 #include <Hypertable/Lib/KeySpec.h>
27 
28 #include <Common/Error.h>
29 #include <Common/System.h>
30 
31 #include <cstdio>
32 #include <cstring>
33 #include <iostream>
34 
35 using namespace Hypertable;
36 using namespace std;
37 
38 namespace {
39 
44  String extract_page(char *request) {
45  String retstr;
46  const char *base, *ptr;
47  if (!strncmp(request, "GET ", 4)) {
48  base = request + 4;
49  if ((ptr = strchr(base, ' ')) != 0)
50  retstr = String(base, ptr-base);
51  else
52  retstr = base;
53  }
54  else
55  retstr = "-";
56  return retstr;
57  }
58 
64  String format_timestamp(struct tm tm) {
65  return format("%d-%02d-%02d %02d:%02d:%02d", tm.tm_year+1900, tm.tm_mon+1,
66  tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec);
67  }
68 
72  void report_error(Exception &e) {
73  cerr << e << endl;
74  }
75 
79  void handle_mutation_failure(TableMutatorPtr &mutator_ptr) {
80  std::vector<std::pair<Cell, int> > failed_mutations;
81 
82  mutator_ptr->get_failed(failed_mutations);
83  if (!failed_mutations.empty()) {
84  for (size_t i=0; i<failed_mutations.size(); i++) {
85  cerr << "Failed: (" << failed_mutations[i].first.row_key << ","
86  << failed_mutations[i].first.column_family;
87  if (failed_mutations[i].first.column_qualifier)
88  cerr << ":" << failed_mutations[i].first.column_qualifier;
89  cerr << "," << failed_mutations[i].first.timestamp << ") - "
90  << Error::get_text(failed_mutations[i].second) << endl;
91  }
92  }
93  }
94 
// Usage text that main() prints to standard output when the command line
// is neither "<file>" nor "--time-order <file>".
95  const char *usage =
96  "\n"
97  " usage: apache_log_load [--time-order] <file>\n"
98  "\n"
99  " Loads the Apache web log <file> into the LogDb\n"
100  " table. By default, the row key is constructed\n"
101  " as:\n"
102  "\n"
103  " <page> <timestamp>\n"
104  "\n"
105  " This format facilitates queries that return\n"
106  " the click history for a specific page. If\n"
107  " the --time-order switch is supplied, then\n"
108  " the row key is constructed as:\n"
109  "\n"
110  " <timestamp> <page>\n"
111  "\n"
112  " This format facilitates queries that return\n"
113  " a historical portion of the log.\n";
114 
// Timeout value handed to the mutator's retry() calls in main().
// NOTE(review): the unit (seconds vs. milliseconds) is not visible in this
// file -- confirm against the TableMutator::retry() declaration.
115  const int RETRY_TIMEOUT = 30;
116 
117 }
118 
119 
120 
144 int main(int argc, char **argv) {
145  ApacheLogParser parser;
146  ApacheLogEntry entry;
147  ClientPtr client_ptr;
148  NamespacePtr namespace_ptr;
149  TablePtr table_ptr;
150  TableMutatorPtr mutator_ptr;
151  KeySpec key;
152  const char *inputfile;
153  bool time_order = false;
154  String row;
155 
156  if (argc == 2)
157  inputfile = argv[1];
158  else if (argc == 3 && !strcmp(argv[1], "--time-order")) {
159  time_order = true;
160  inputfile = argv[2];
161  }
162  else {
163  cout << usage << endl;
164  return 0;
165  }
166 
167  try {
168 
169  // Create Hypertable client object
170  client_ptr = make_shared<Client>( System::locate_install_dir(argv[0]) );
171 
172  // Open the root namespace
173  namespace_ptr = client_ptr->open_namespace("/");
174 
175  // Open the 'LogDb' table
176  table_ptr = namespace_ptr->open_table("LogDb");
177 
178  // Create a mutator object on the
179  // 'LogDb' table
180  mutator_ptr.reset(table_ptr->create_mutator());
181 
182  }
183  catch (Exception &e) {
184  report_error(e);
185  return 1;
186  }
187 
188  // Load the log file into the ApacheLogParser
189  // object
190  parser.load(inputfile);
191 
192  // The parser next method will return true
193  // until EOF
194  while (parser.next(entry)) {
195 
196  // Assemble the row key
197  if (time_order) {
198  row = format_timestamp(entry.tm);
199  row += " ";
200  row += extract_page(entry.request);
201  }
202  else {
203  row = extract_page(entry.request);
204  row += " ";
205  row += format_timestamp(entry.tm);
206  }
207 
208  key.row = row.c_str();
209  key.row_len = row.length();
210 
211  try {
212  key.column_family = "ClientIpAddress";
213  mutator_ptr->set(key, entry.ip_address);
214  key.column_family = "UserId";
215  mutator_ptr->set(key, entry.userid);
216  key.column_family = "Request";
217  mutator_ptr->set(key, entry.request);
218  key.column_family = "ResponseCode";
219  mutator_ptr->set(key, entry.response_code);
220  key.column_family = "ObjectSize";
221  mutator_ptr->set(key, entry.object_size);
222  key.column_family = "Referer";
223  mutator_ptr->set(key, entry.referer);
224  key.column_family = "UserAgent";
225  mutator_ptr->set(key, entry.user_agent);
226  }
227  catch (Exception &e) {
228  HT_ERROR_OUT << e << HT_END;
229  do {
230  if (!mutator_ptr->need_retry())
231  quick_exit(EXIT_FAILURE);
232  handle_mutation_failure(mutator_ptr);
233  } while (!mutator_ptr->retry(RETRY_TIMEOUT));
234  }
235  }
236 
237  // Flush pending updates
238  try {
239  mutator_ptr->flush();
240  }
241  catch (Exception &e) {
242  HT_ERROR_OUT << e << HT_END;
243  do {
244  if (!mutator_ptr->need_retry())
245  quick_exit(EXIT_FAILURE);
246  handle_mutation_failure(mutator_ptr);
247  } while (!mutator_ptr->retry(RETRY_TIMEOUT));
248  }
249 
250  return 0;
251 }
Retrieves system information (hardware, installation directory, etc.)
std::string String
A String is simply a typedef to std::string.
Definition: String.h:44
String format(const char *fmt,...)
Returns a String using printf-like format facilities. Vanilla snprintf is about 1.5x faster than this...
Definition: String.cc:37
int main(int argc, char **argv)
This program is designed to parse an Apache web server log and insert the values into a table called ...
bool next(ApacheLogEntry &entry)
STL namespace.
char * user_agent
const void * row
Definition: KeySpec.h:125
char * userid
std::shared_ptr< Namespace > NamespacePtr
Shared smart pointer to Namespace.
Definition: Namespace.h:333
std::shared_ptr< Client > ClientPtr
Definition: Client.h:156
std::shared_ptr< TableMutator > TableMutatorPtr
Smart pointer to TableMutator.
Definition: TableMutator.h:257
struct tm tm
const char * get_text(int error)
Returns a descriptive error message.
Definition: Error.cc:330
Compatibility Macros for C/C++.
char * request
char * object_size
#define HT_END
Definition: Logger.h:220
#define HT_ERROR_OUT
Definition: Logger.h:301
char * response_code
Hypertable definitions
static String locate_install_dir(const char *argv0)
Returns the installation directory.
Definition: System.cc:50
void load(std::string filename)
This is a generic exception class for Hypertable.
Definition: Error.h:314
char * referer
const char * column_family
Definition: KeySpec.h:127
Error codes, Exception handling, error logging.
std::shared_ptr< Table > TablePtr
Definition: Table.h:53
char * ip_address