0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
prune_tsv.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 3
9  * of the License.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  * 02110-1301, USA.
20  */
21 
22 #include <Common/Compat.h>
23 
24 #include <AsyncComm/Config.h>
25 
26 #include <Common/Init.h>
27 #include <Common/Logger.h>
28 
29 #include <cctype>
30 #include <cstdlib>
31 #include <cstring>
32 #include <ctime>
33 #include <iostream>
34 #include <string>
35 
36 using namespace Hypertable;
37 using namespace Hypertable::Config;
38 using namespace std;
39 
40 namespace {
41 
/// Help text shown ahead of the option list (passed to cmdline_desc()).
const char *usage =
  "\nusage: prune_tsv [options] <past-date-offset>\n\n"
  "description:\n"
  " This program removes lines read from stdin that contain a timestamp\n"
  " or date string representing a date that is older than the current time\n"
  " minus <past-date-offset>. The <past-date-offset> argument can be specified\n"
  " as days, months, or years (examples: 1y, 6m, 21d). The --field option\n"
  " can be used to select the tab delimited field to search (default == 0). If\n"
  " the field contains all digits, then it is interpreted as nanoseconds since\n"
  " the epoch (or seconds if --seconds option is supplied). Otherwise, the\n"
  " field is searched for the pattern YYYY-MM-DD, which is taken to be the date.\n\n"
  "options";
54 
55  struct AppPolicy : Policy {
56  static void init_options() {
57  cmdline_desc(usage).add_options()
58  ("field", i32()->default_value(0), "Field number of each line to parse")
59  ("newer", boo()->zero_tokens()->default_value(false),
60  "Remove lines that are newer than calculated cutoff date")
61  ("seconds", boo()->zero_tokens()->default_value(false),
62  "Interpret all-digit fields as seconds instead of nanoseconds")
63  ("zhack", boo()->zero_tokens()->default_value(false), "")
64  ;
65  cmdline_hidden_desc().add_options()("past-date-offset", str(), "");
66  cmdline_positional_desc().add("past-date-offset", -1);
67  }
68  static void init() {
69  if (!has("past-date-offset")) {
70  cout << cmdline_desc() << endl;
71  quick_exit(EXIT_FAILURE);
72  }
73  }
74  };
75 
76  typedef Meta::list<AppPolicy, DefaultCommPolicy> Policies;
77 
/**
 * Isolates one tab-delimited field of a line, in place.
 *
 * If the selected field is followed by a tab, that tab is overwritten with a
 * NUL so the field can be used as a C string, and the address of the
 * overwritten byte is reported through @p endptr so the caller can restore
 * the tab afterwards.
 *
 * @param line    Writable, NUL-terminated line
 * @param fieldno Zero-based index of the field to select
 * @param endptr  If non-null, receives the address of the overwritten tab, or
 *                null when nothing was overwritten (the field is the last one
 *                on the line, or the field does not exist)
 * @return Pointer to the first character of the field, or null if the line
 *         has too few fields
 */
inline char *get_field(char *line, int fieldno, char **endptr) {
  char *base = line;
  char *ptr = strchr(base, '\t');

  // Always reset the out-parameter: callers reuse the same variable for every
  // line, and a pointer left over from a previous line must never be written
  // through again.
  if (endptr)
    *endptr = nullptr;

  // Step over one tab per field until the requested field is reached or the
  // line runs out of tabs.
  while (fieldno && ptr) {
    fieldno--;
    base = ptr + 1;
    ptr = strchr(base, '\t');
  }

  // Ran out of tabs before reaching the requested field.
  if (fieldno > 0)
    return nullptr;

  if (ptr) {
    *ptr = 0;
    if (endptr)
      *endptr = ptr;
  }
  return base;
}
97 
/**
 * Locates the timestamp or date inside a field.
 *
 * A non-empty field consisting only of digits (up to a tab, newline or NUL)
 * is reported as an unformatted numeric timestamp. Otherwise the string is
 * searched for the first occurrence of the pattern DDDD-DD-DD (D = digit),
 * which is reported as a formatted date.
 *
 * @param str        NUL-terminated field text
 * @param formattedp Set to false when an all-digit field is returned and to
 *                   true when a YYYY-MM-DD match is returned; left untouched
 *                   when nothing is found
 * @return Pointer to the start of the digits / date within @p str, or null
 *         if neither form is present
 */
inline const char *find_date(const char *str, bool *formattedp) {
  // isdigit() is only defined for values representable as unsigned char.
  auto is_digit = [](char c) {
    return isdigit(static_cast<unsigned char>(c)) != 0;
  };

  const char *dash = str;
  bool alldigits = true;

  // Advance to the first '-' or to the end of the field, noting whether
  // everything seen so far is a digit.
  while (*dash != '-' && *dash != '\t' && *dash != '\n' && *dash) {
    if (!is_digit(*dash))
      alldigits = false;
    dash++;
  }

  if (*dash != '-') {
    if (alldigits && dash > str) {
      *formattedp = false;
      return str;
    }
    return nullptr;
  }

  // Try every '-' as the first separator of YYYY-MM-DD. Four characters must
  // precede it, so a date that starts the field has dash - str == 4. The
  // forward checks short-circuit on the first non-digit (including NUL), so
  // they never read past the terminator.
  do {
    if (dash - str >= 4 &&
        is_digit(dash[-4]) && is_digit(dash[-3]) &&
        is_digit(dash[-2]) && is_digit(dash[-1]) &&
        is_digit(dash[1]) && is_digit(dash[2]) &&
        dash[3] == '-' &&
        is_digit(dash[4]) && is_digit(dash[5])) {
      *formattedp = true;
      return dash - 4;
    }
  } while ((dash = strchr(dash + 1, '-')) != nullptr);

  return nullptr;
}
134 
/**
 * Extracts the integer that immediately precedes the first '.' in a string.
 *
 * @param str NUL-terminated text to search
 * @return The run of decimal digits directly before the first period, parsed
 *         as a number; 0 if the string contains no period or no digits
 *         precede it
 */
inline time_t find_seconds(const char *str) {
  const char *period = strchr(str, '.');

  if (period == nullptr)
    return 0;

  // Walk back over the digits in front of the period. The cast keeps
  // isdigit() defined for bytes >= 0x80 when char is signed.
  const char *ptr = period;
  while (ptr > str && isdigit(static_cast<unsigned char>(*(ptr - 1))))
    ptr--;

  return strtol(ptr, nullptr, 10);
}
147 
/**
 * Converts a <past-date-offset> argument such as "21d", "6m" or "1y" into a
 * number of seconds.
 *
 * The argument is a decimal count followed by a unit letter (case
 * insensitive): d = day, m = 30 days, y = 365 days. If the count or the unit
 * is missing, or the unit is not recognised, an error is printed to stdout
 * and the process terminates via quick_exit(EXIT_FAILURE).
 *
 * @param str NUL-terminated argument text
 * @return Offset in seconds
 */
time_t parse_date_offset(const char *str) {
  char *unit = 0;
  const long count = strtol(str, &unit, 10);
  long seconds_per_unit = 0;

  // With no leading number there is no unit to look at; '\0' routes such
  // input (and a bare number with no suffix) to the error branch below.
  const char suffix = (unit == str) ? '\0' : *unit;

  switch (suffix) {
  case 'd':
  case 'D':
    seconds_per_unit = 24 * 60 * 60;
    break;
  case 'm':
  case 'M':
    seconds_per_unit = 30 * 24 * 60 * 60;
    break;
  case 'y':
  case 'Y':
    seconds_per_unit = 365 * 24 * 60 * 60;
    break;
  default:
    cout << "\nERROR: Invalid <past-date-offset> argument: " << str << "\n" << endl;
    quick_exit(EXIT_FAILURE);
  }

  return count * seconds_per_unit;
}
170 
171 
172 }
173 
177 int main(int argc, char **argv) {
178  string date_offset_str;
179  const char *base;
180  char *end = 0;
181  time_t date_offset, cutoff_time, line_time;
182  struct tm tm;
183  char cutoff[32];
184  int32_t field;
185  bool newer = false;
186  bool seconds = false;
187  bool formatted;
188 
189  char *line_buffer = new char [ 1024 * 1024 ];
190 
191  ios::sync_with_stdio(false);
192 
193  try {
194  init_with_policies<Policies>(argc, argv);
195 
196  field = get_i32("field");
197 
198  date_offset_str = get_str("past-date-offset");
199  date_offset = parse_date_offset(date_offset_str.c_str());
200  newer = get_bool("newer");
201  seconds = get_bool("seconds");
202 
203  cutoff_time = time(0) - date_offset;
204 
205  if (get_bool("zhack")) {
206  time_t line_seconds;
207  while (!cin.eof()) {
208 
209  cin.getline(line_buffer, 1024*1024);
210  if (cin.eof() && cin.gcount() <= 1)
211  break;
212  if ((base = get_field(line_buffer, field, &end)) &&
213  (line_seconds = find_seconds(base)) &&
214  ((!newer && line_seconds < cutoff_time) ||
215  (newer && line_seconds > cutoff_time)))
216  continue;
217  if (end)
218  *end = '\t';
219  cout << line_buffer << "\n";
220  }
221  }
222  else {
223  localtime_r(&cutoff_time, &tm);
224  strftime(cutoff, sizeof(cutoff), "%F", &tm);
225  while (!cin.eof()) {
226  cin.getline(line_buffer, 1024*1024);
227  if (line_buffer[0] == 0)
228  continue;
229  if ((base = get_field(line_buffer, field, &end)) &&
230  (base = find_date(base, &formatted))) {
231  if (formatted) {
232  if ((!newer && memcmp(base, cutoff, 10) < 0) ||
233  (newer && memcmp(base, cutoff, 10) >= 0))
234  continue;
235  }
236  else {
237  if (seconds)
238  line_time = (time_t)strtoll(base, &end, 10);
239  else
240  line_time = (time_t)(strtoll(base, &end, 10) / 1000000000LL);
241  if ((!newer && (line_time < cutoff_time)) ||
242  (newer && (line_time >= cutoff_time)))
243  continue;
244  }
245  }
246  if (end)
247  *end = '\t';
248  cout << line_buffer << "\n";
249  }
250  }
251  cout << flush;
252  quick_exit(EXIT_SUCCESS);
253  }
254  catch (std::exception &e) {
255  cerr << "Error - " << e.what() << endl;
256  exit(EXIT_FAILURE);
257  }
258 
259 }
Declarations for configuration properties.
Interface and base of config policy.
Definition: Config.h:149
void init(int argc, char *argv[], const Desc *desc=NULL)
Initialize with default policy.
Definition: Init.h:95
Po::typed_value< String > * str(String *v=0)
Definition: Properties.h:166
STL namespace.
Desc & cmdline_desc(const char *usage)
A macro which defines global functions like get_bool(), get_str(), get_i16() etc. ...
Definition: Config.cc:72
bool has(const String &name)
Check existence of a configuration value.
Definition: Config.h:57
Po::typed_value< int32_t > * i32(int32_t *v=0)
Definition: Properties.h:178
Logging routines and macros.
Compatibility Macros for C/C++.
Po::typed_value< bool > * boo(bool *v=0)
Definition: Properties.h:162
Initialization helper for applications.
Hypertable definitions
Meta::list< MyPolicy, DefaultPolicy > Policies
int main(int argc, char **argv)
Definition: prune_tsv.cc:177
Desc & cmdline_hidden_desc()
Get the command line hidden options description (for positional options)
Definition: Config.cc:81
PositionalDesc & cmdline_positional_desc()
Get the command line positional options description.
Definition: Config.cc:90