0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
bmzip.c
Go to the documentation of this file.
1 
22 #include "Common/compat-c.h"
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <limits.h>
28 #include <stdint.h>
29 #include <stdarg.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <sys/stat.h>
33 #ifndef HT_NO_MMAP
34 # include <sys/mman.h>
35 static int s_no_mmap = 0;
36 #else
37 static int s_no_mmap = 1;
38 #endif
39 
40 #include "bmz-internal.h"
41 
42 #pragma GCC diagnostic ignored "-Wpedantic"
43 
/* Magic bytes that open every bmz file header. */
#define BMZ_MAGIC "BMZ"

enum {
  /* Format version of this tool; parse_bmz_header() rejects anything newer. */
  BMZIP_VER = 0x0110,
  /*
   * Header layout: magic (3) + version (2) + options (1) +
   * original size (6) + checksum (4).
   * The magic length is spelled as a literal because the
   * strlen(BMZ_MAGIC) form generates C90 warnings.
   */
  BMZ_HEADER_SZ = 3 + 2 + 1 + 6 + 4
};

/* Actions selectable from the command line. */
enum {
  BMZ_A_PACK = 0,
  BMZ_A_UNPACK = 1,
  BMZ_A_LIST = 2
};

/* Option bits stored in the header's options byte. */
enum {
  BMZ_O_BM_ONLY = 1,
  BMZ_O_STREAM = 2 /* TODO */
};

typedef unsigned char Byte;

/* To silence warnings in format strings */
typedef unsigned long long Llu;
typedef unsigned long Lu;

/* Process-wide settings, assigned while main() parses the command line. */
static int s_verbosity = 0;
static int s_bm_dump = 0;
static int s_bm_hash = 0;
67 
/*
 * Diagnostics on stderr, prefixed with "bmzip: <function>: ".
 *
 * LOG prints only when s_verbosity >= _lvl_.  If errno is non-zero its
 * strerror() text is appended, so callers that want a clean message reset
 * errno first.  The whole expansion is a single do/while(0) statement so
 * that LOG can be used as the body of an if that has an else branch.
 */
#define LOG(_lvl_, _fmt_, ...) do { \
  if (s_verbosity >= (_lvl_)) { \
    fprintf(stderr, "bmzip: %s: " _fmt_, __FUNCTION__, ##__VA_ARGS__); \
    if (errno) fprintf(stderr, ": %s", strerror(errno)); \
    putc('\n', stderr); \
  } \
} while (0)

/* Always-visible (level 0) warning. */
#define WARN(_fmt_, ...) do { \
  LOG(0, "warning: " _fmt_, ##__VA_ARGS__); \
} while (0)

/* Level 0 message followed by exit(1); never returns. */
#define DIE(_fmt_, ...) do { \
  LOG(0, "fatal: " _fmt_, ##__VA_ARGS__); \
  exit(1); \
} while (0)
82 
/*
 * Advance _mem_ to the next address that is a multiple of _n_ bytes.
 * An address that is already aligned is still moved forward by a full _n_.
 * Both arguments are evaluated more than once; pass side-effect-free
 * expressions.
 */
#define BMZ_ALIGN(_mem_, _n_) \
  ((Byte *)(_mem_) + (_n_) - (((size_t)(_mem_)) % (_n_)))
84 
/*
 * Big-endian integer readers.  _p_ is a byte pointer lvalue that is advanced
 * past the bytes consumed; _n_ is the lvalue receiving the value.
 *
 * Each byte is widened to an unsigned type of the result's width before it
 * is shifted, so a byte >= 0x80 cannot overflow int or sign-extend into the
 * upper bits of a wider result.  Every macro is one do/while(0) statement.
 */
#define BMZ_READ_INT16(_p_, _n_) do { \
  (_n_) = (uint16_t)((unsigned)*(_p_)++ << 8); \
  (_n_) |= *(_p_)++; \
} while (0)

#define BMZ_READ_INT32(_p_, _n_) do { \
  (_n_) = (uint32_t)*(_p_)++ << 24; \
  (_n_) |= (uint32_t)*(_p_)++ << 16; \
  (_n_) |= (uint32_t)*(_p_)++ << 8; \
  (_n_) |= (uint32_t)*(_p_)++; \
} while (0)

#define BMZ_READ_INT48(_p_, _n_) do { \
  (_n_) = (uint64_t)*(_p_)++ << 40; \
  (_n_) |= (uint64_t)*(_p_)++ << 32; \
  (_n_) |= (uint64_t)*(_p_)++ << 24; \
  (_n_) |= (uint64_t)*(_p_)++ << 16; \
  (_n_) |= (uint64_t)*(_p_)++ << 8; \
  (_n_) |= (uint64_t)*(_p_)++; \
} while (0)
102 
/*
 * Big-endian integer writers.  _p_ is a byte pointer lvalue that is advanced
 * past the bytes stored; _n_ is the value to store.
 *
 * _n_ is parenthesized (and widened for the 32/48-bit forms) before shifting
 * so that expression arguments such as `a | b` are stored correctly.  Every
 * macro is one do/while(0) statement.  _n_ is evaluated more than once.
 */
#define BMZ_WRITE_INT16(_p_, _n_) do { \
  *(_p_)++ = (Byte)((_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)

#define BMZ_WRITE_INT32(_p_, _n_) do { \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 24); \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 16); \
  *(_p_)++ = (Byte)((uint32_t)(_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)

#define BMZ_WRITE_INT48(_p_, _n_) do { \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 40); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 32); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 24); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 16); \
  *(_p_)++ = (Byte)((uint64_t)(_n_) >> 8); \
  *(_p_)++ = (Byte)(_n_); \
} while (0)
120 
121 static void
122 read_bmz_header(int fd, Byte *buf) {
123  if (read(fd, buf, BMZ_HEADER_SZ) != BMZ_HEADER_SZ)
124  DIE("error reading bmz file header (%lu bytes)", (Lu)BMZ_HEADER_SZ);
125 }
126 
127 static void
128 parse_bmz_header(const Byte *buf, uint16_t *version_p, uint64_t *orig_size_p,
129  uint32_t *checksum_p, uint32_t *options) {
130  const Byte *bp = buf;
131  size_t magic_len = strlen(BMZ_MAGIC);
132 
133  if (memcmp(buf, BMZ_MAGIC, magic_len)) {
134  DIE("bad magic in file header (%lu bytes)", (Lu)magic_len);
135  }
136  bp += magic_len;
137  BMZ_READ_INT16(bp, *version_p);
138 
139  if (*version_p > BMZIP_VER)
140  DIE("incomaptible version: %04x", *version_p);
141 
142  *options = *bp++;
143  BMZ_READ_INT48(bp, *orig_size_p);
144  BMZ_READ_INT32(bp, *checksum_p);
145 }
146 
147 static void
148 write_bmz_header(int fd, size_t in_len, uint32_t checksum, Byte options) {
149  char buf[BMZ_HEADER_SZ], *bp = buf;
150  uint64_t orig_size = in_len;
151 
152  strcpy(buf, BMZ_MAGIC);
153  bp += strlen(BMZ_MAGIC);
155  *bp++ = options;
156  BMZ_WRITE_INT48(bp, orig_size);
157  BMZ_WRITE_INT32(bp, checksum);
158 
159  if (write(fd, buf, BMZ_HEADER_SZ) != BMZ_HEADER_SZ)
160  DIE("error writing header (%lu bytes)", (Lu)BMZ_HEADER_SZ);
161 }
162 
163 static void
164 do_list(int fd) {
165  Byte buf[BMZ_HEADER_SZ];
166  uint16_t version;
167  uint64_t orig_size, size;
168  uint32_t checksum, options;
169  struct stat st;
170 
171  if (fstat(fd, &st) != 0) DIE("error getting stat from file (%d)", fd);
172 
173  size = st.st_size;
174  read_bmz_header(fd, buf);
175  parse_bmz_header(buf, &version, &orig_size, &checksum, &options);
176  printf("%8s%16s%16s%8s\n", "version", "compressed", "uncompressed", "ratio");
177  printf(" %04x%16llu%16llu%7.2f%%\n", version, (Llu)size,
178  (Llu)orig_size, orig_size ? size * 100. / orig_size : 1);
179 }
180 
181 static void
182 do_pack(const void *in, size_t in_len, size_t buf_len,
183  size_t offset, size_t fp_len, Byte options) {
184  size_t buflen = bmz_pack_buflen(in_len), out_len = buflen;
185  size_t worklen = bmz_pack_worklen(in_len, fp_len);
186  int ret, bm_only = (options & BMZ_O_BM_ONLY) || s_bm_dump;
187  Byte *out, *work_mem;
188 
189  if (bm_only) {
190  out_len = in_len + 1;
191 
192  if (buf_len > in_len + worklen) {
193  out = (Byte *)in + in_len;
194  work_mem = out + out_len;
195  }
196  else {
197  out = malloc(worklen); /* bmz_pack_worklen includes out_len for bm */
198 
199  if (!out)
200  DIE("error allocating %lu bytes memory", (Lu)worklen);
201 
202  work_mem = out + out_len;
203  }
204  /* calling internal API need to align work memory */
205  work_mem = BMZ_ALIGN(work_mem, 8);
206  }
207  else if (buf_len > buflen + worklen) {
208  work_mem = (Byte *)in + buflen;
209  out = (Byte *)in; /* inplace */
210  }
211  else {
212  out = malloc(buflen + worklen);
213 
214  if (!out)
215  DIE("error allocating %lu bytes memory", (Lu)buflen + worklen);
216 
217  work_mem = out + buflen;
218  }
219 
220  if (bm_only) {
221  ret = bmz_bm_pack_mask(in, in_len, out, &out_len, offset, fp_len,
222  work_mem, 257);
223  if (ret != BMZ_E_OK)
224  DIE("error encoding bm output (error %d)", ret);
225 
226  if (s_bm_dump) {
227  if ((ret = bmz_bm_dump(out, out_len)) != BMZ_E_OK)
228  WARN("error dumping bm encoding (ret=%d)", ret);
229 
230  return;
231  }
232  }
233  else if ((ret = bmz_pack(in, in_len, out, &out_len, offset, fp_len,
234  (s_bm_hash << 24), work_mem))
235  != BMZ_E_OK) {
236  DIE("error compressing input (error %d)", ret);
237  }
238  write_bmz_header(1, in_len, bmz_checksum(out, out_len), options);
239  write(1, out, out_len);
240 }
241 
242 static void
243 do_unpack(const void *in, size_t in_len, size_t buf_len) {
244  const Byte *bp = (Byte *)in;
245  uint16_t version;
246  uint64_t orig_size;
247  uint32_t checksum, cs, options;
248  size_t outlen, worklen, len = in_len - BMZ_HEADER_SZ;
249  Byte *out, *workmem;
250  int ret;
251 
252  if (in_len < BMZ_HEADER_SZ) DIE("file truncated (size: %lu)", (Lu)in_len);
253 
254  parse_bmz_header(bp, &version, &orig_size, &checksum, &options);
255 
256  if (orig_size > INT_MAX && sizeof(size_t) == 4)
257  DIE("original file size %llu requires 64-bit version of bmzip",
258  (Llu)orig_size);
259 
260  bp += BMZ_HEADER_SZ;
261  buf_len -= BMZ_HEADER_SZ;
262  cs = bmz_checksum(bp, len);
263  outlen = orig_size;
264 
265  if (cs != checksum)
266  DIE("checksum mismatch (expecting %x, got %x).", checksum, cs);
267 
268  if (options & BMZ_O_BM_ONLY) {
269  out = buf_len > in_len + orig_size ? (Byte*)bp + len : malloc(outlen);
270 
271  if ((ret = bmz_bm_unpack(bp, len, out, &outlen)) != BMZ_E_OK)
272  DIE("error decoding bm input (error %d)", ret);
273  }
274  else {
275  worklen = bmz_unpack_worklen(orig_size > len ? orig_size : len);
276  out = (buf_len > outlen + worklen) ? (Byte *)bp : malloc(outlen + worklen);
277  workmem = out + outlen;
278 
279  if ((ret = bmz_unpack(bp, len, out, &outlen, workmem)) != BMZ_E_OK)
280  DIE("error decompressing (error %d)", ret);
281  }
282  if (orig_size != outlen)
283  WARN("size mismatch (expecting %llu, got %llu)",
284  (Llu)orig_size, (Llu)outlen);
285 
286  write(1, out, outlen);
287 }
288 
289 static void
290 do_block(const void *in, size_t len, size_t buf_len, size_t offset,
291  size_t fp_len, int action, int options) {
292  switch (action) {
293  case BMZ_A_PACK:
294  do_pack(in, len, buf_len, offset, fp_len, options);
295  break;
296  case BMZ_A_UNPACK:
297  do_unpack(in, len, buf_len);
298  break;
299  default:
300  DIE("unknown action: %d", action);
301  }
302 }
303 
304 static char *
305 read_from_fp(FILE *fp, size_t *len_p, size_t *size_p) {
306  char *data = NULL;
307  char buf[65536];
308  int64_t len = 0, size = 0, ret;
309 
310  while ((ret = fread(buf, 1, sizeof(buf), fp)) > 0) {
311  len += ret;
312  if (len > INT_MAX)
313  DIE("reading from stdin for data size greater than 2GB "
314  "not yet supported (current size: %lld)", (long long)len);
315 
316  if (len > size) {
317  size = (len + 16) * 5 / 2;
318  data = realloc(data, size);
319  }
320  memcpy(data + len - ret, buf, ret);
321  }
322  *len_p = len;
323  *size_p = size;
324  return data;
325 }
326 
327 static char *
328 read_from_fd(int fd, size_t *len_p, size_t *size_p) {
329  struct stat st;
330  void *data = NULL;
331  size_t sz;
332 
333  if (fstat(fd, &st) != 0) DIE("cannot stat fd <%d>", fd);
334 
335  if (st.st_size > INT_MAX && sizeof(size_t) == 4)
336  DIE("file size %llu requires 64-bit version of bmzip",
337  (Llu)st.st_size);
338 
339  sz = *len_p = *size_p = st.st_size;
340 
341  if (!sz) return data;
342 
343  if (!s_no_mmap) {
344 #ifndef HT_NO_MMAP
345  LOG(1, "mmapping file (size: %lu)...", (Lu)sz);
346  data = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
347 
348  if (!data || (void *)-1 == data) {
349  LOG(1, "mmap failed on fd %d", fd);
350  errno = 0;
351  LOG(1, "%s", "trying alternative");
352  data = NULL;
353  }
354 #endif
355  }
356  if (!data) {
357  LOG(1, "reading file (size: %lu) into memory...", (Lu)sz);
358  data = malloc(sz);
359 
360  if (!data) DIE("cannot allocate %lu bytes memory", (Lu)sz);
361 
362  if (read(fd, data, sz) != sz) DIE("error reading %lu bytes", (Lu)sz);
363  }
364 
365  return data;
366 }
367 
368 static void
369 input_from_stdin(size_t offset, size_t fp_len, int action, int options) {
370  size_t len, buf_len;
371 
372  if (action == BMZ_A_LIST) {
373  do_list(0);
374  }
375  else {
376  void *data = read_from_fp(stdin, &len, &buf_len);
377  do_block(data, len, buf_len, offset, fp_len, action, options);
378  }
379 }
380 
381 static void
382 input_from_file(const char *fname, size_t offset, size_t fp_len, int action,
383  int options) {
384  size_t len, buf_len;
385  int fd = open(fname, O_RDONLY, 0);
386 
387  if (fd == -1) DIE("cannot open '%s'", fname);
388 
389  if (action == BMZ_A_LIST) {
390  do_list(fd);
391  }
392  else {
393  void *data = read_from_fd(fd, &len, &buf_len);
394  do_block(data, len, buf_len, offset, fp_len, action, options);
395  }
396  /* close and free etc. are omitted intentionally */
397 }
398 
399 static int
400 bm_hash(const char *name) {
401 
402  if (!strcmp("mod", name)) return BMZ_HASH_MOD;
403  else if (!strcmp("mod16x2", name)) return BMZ_HASH_MOD16X2;
404  else if (!strcmp("mask16x2", name)) return BMZ_HASH_MASK16X2;
405  else if (!strcmp("mask", name)) return BMZ_HASH_MASK;
406  else if (!strcmp("mask32x2", name)) return BMZ_HASH_MASK32X2;
407 
408  DIE("unknown hash: %s", name);
409  return 0;
410 }
411 
412 static void HT_NORETURN
414  fprintf(stderr, "%s%s", /* c89 string literal limit is 509 */
415  "usage: bmzip [options] [<file>]\n"
416  "-d, --decompress decompress to stdout\n"
417  "--verbose[=level] show some diagnostic messages\n"
418  "-l, --list list compressed file info\n"
419  "-h, --help show this message\n"
420  "--offset <number> expert: bm encoding start offset\n"
421  "--fp-len <number> expert: bm encoding fingerprint size\n"
422  "--bm-thresh <number> expert: bm hash collision threshold\n",
423  "--bm-hash <name> expert: use <name> as bm hash\n"
424  "--bm-only expert: skip lz compression\n"
425  "--bm-dump expert: dump human readable bm encoding\n"
426  "--no-mmap expert: do not use mmap\n");
427  exit(0);
428 }
429 
430 int
431 main(int ac, char *av[]) {
432  char **ia = av + 1, **a_end = av + ac;
433  /* defaults */
434  size_t fp_len = 64, offset = 0;
435  int bm_thresh = 0, action = BMZ_A_PACK, options = 0;
436 
437  for (; ia < a_end; ++ia) {
438  if (!strcmp("-d", *ia) ||
439  !strcmp("--decompress", *ia)) action = BMZ_A_UNPACK;
440  else if (!strcmp("--verbose", *ia)) s_verbosity = 1;
441  else if (!strcmp("--verbose=", *ia)) s_verbosity = atoi(*ia + 9);
442  else if (!strcmp("--offset", *ia)) offset = atoi(*++ia);
443  else if (!strcmp("--fp-len", *ia)) fp_len = atoi(*++ia);
444  else if (!strcmp("--bm-only", *ia)) options |= BMZ_O_BM_ONLY;
445  else if (!strcmp("--bm-dump", *ia)) s_bm_dump = 1;
446  else if (!strcmp("--no-mmap", *ia)) s_no_mmap = 1;
447  else if (!strcmp("--bm-thresh", *ia)) bm_thresh = atoi(*++ia);
448  else if (!strcmp("--bm-hash", *ia)) s_bm_hash = bm_hash(*++ia);
449  else if (!strcmp("-l", *ia) ||
450  !strcmp("--list", *ia)) action = BMZ_A_LIST;
451  else if (!strcmp("-h", *ia) ||
452  !strcmp("--help", *ia)) {
453  show_usage();
454  }
455  else if (!strcmp("--version", *ia)) {
456  LOG(0, "version %d.%d.%d.%d", BMZIP_VER >> 12, (BMZIP_VER >> 8) & 0xf,
457  (BMZIP_VER >> 4) & 0xf, BMZIP_VER & 0xf);
458  exit(0);
459  }
460  else if (!strcmp("--", *ia)) {
461  ++ia;
462  break;
463  }
464  else if ('-' == **ia)
465  DIE("unknown option: %s\n", *ia);
466  else break;
467  }
468  if (s_verbosity)
470 
471  if (bm_thresh)
472  bmz_set_collision_thresh(bm_thresh);
473 
474  if (ia >= a_end)
475  input_from_stdin(offset, fp_len, action, options);
476  else
477  input_from_file(*ia, offset, fp_len, action, options);
478 
479  return 0;
480 }
int bmz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem)
Perform bmz compression.
Definition: bmz.c:1254
long long unsigned Llu
Definition: bmzip.c:61
#define BMZ_O_BM_ONLY
Definition: bmzip.c:55
static void do_list(int fd)
Definition: bmzip.c:164
#define BMZ_WRITE_INT48(_p_, _n_)
Definition: bmzip.c:113
bmz_bm_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p)
Definition: bmz.c:1338
#define BMZ_MAGIC
Definition: bmzip.c:44
#define BMZ_WRITE_INT32(_p_, _n_)
Definition: bmzip.c:107
int main(int ac, char *av[])
Definition: bmzip.c:431
#define HT_NORETURN
Definition: compat-c.h:60
static int s_verbosity
Definition: bmzip.c:64
#define BMZ_HASH_MASK
Definition: bmz-internal.h:31
long unsigned Lu
Definition: bmzip.c:62
static void read_bmz_header(int fd, Byte *buf)
Definition: bmzip.c:122
#define BMZ_ALIGN(_mem_, _n_)
Definition: bmzip.c:83
unsigned char Byte
Definition: bmzip.c:58
static void input_from_file(const char *fname, size_t offset, size_t fp_len, int action, int options)
Definition: bmzip.c:382
int bmz_set_verbosity(int verbosity)
Set the verbosity of library for testing and debugging.
Definition: bmz.c:246
#define BMZ_READ_INT16(_p_, _n_)
Definition: bmzip.c:85
#define BMZIP_VER
Definition: bmzip.c:45
#define DIE(_fmt_,...)
Definition: bmzip.c:78
#define BMZ_READ_INT32(_p_, _n_)
Definition: bmzip.c:89
static int bm_hash(const char *name)
Definition: bmzip.c:400
#define BMZ_HASH_MASK16X2
Definition: bmz-internal.h:30
long long unsigned Llu
Definition: bmz.c:90
static void do_unpack(const void *in, size_t in_len, size_t buf_len)
Definition: bmzip.c:243
long unsigned Lu
Definition: bmz-test.c:39
uint8_t Byte
Definition: bmz.c:83
static void HT_NORETURN show_usage()
Definition: bmzip.c:413
#define BMZ_A_PACK
Definition: bmzip.c:51
bmz_set_collision_thresh(int thresh)
Definition: bmz.c:260
#define BMZ_HASH_MOD16X2
Definition: bmz-internal.h:29
int bmz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p, void *work_mem)
Perform bmz decompression.
Definition: bmz.c:1280
static void do_pack(const void *in, size_t in_len, size_t buf_len, size_t offset, size_t fp_len, Byte options)
Definition: bmzip.c:182
static char * read_from_fd(int fd, size_t *len_p, size_t *size_p)
Definition: bmzip.c:328
#define BMZ_E_OK
Definition: bmz.h:32
size_t bmz_pack_buflen(size_t in_len)
Compute bmz compression output buffer length.
Definition: bmz.c:1164
#define BMZ_WRITE_INT16(_p_, _n_)
Definition: bmzip.c:103
size_t bmz_unpack_worklen(size_t out_len)
Return size of work memory for bmz decompression.
Definition: bmz.c:1192
#define LOG(_lvl_, _fmt_,...)
Definition: bmzip.c:68
static void do_block(const void *in, size_t len, size_t buf_len, size_t offset, size_t fp_len, int action, int options)
Definition: bmzip.c:290
static int s_no_mmap
Copyright (C) 2007-2015 Hypertable, Inc.
Definition: bmzip.c:35
bmz_bm_dump(const void *in, size_t in_len)
Definition: bmz.c:1396
static void write_bmz_header(int fd, size_t in_len, uint32_t checksum, Byte options)
Definition: bmzip.c:148
#define BMZ_A_UNPACK
Definition: bmzip.c:52
static void parse_bmz_header(const Byte *buf, uint16_t *version_p, uint64_t *orig_size_p, uint32_t *checksum_p, uint32_t *options)
Definition: bmzip.c:128
#define BMZ_A_LIST
Definition: bmzip.c:53
#define BMZ_HASH_MASK32X2
Definition: bmz-internal.h:32
static char * read_from_fp(FILE *fp, size_t *len_p, size_t *size_p)
Definition: bmzip.c:305
unsigned bmz_checksum(const void *in, size_t in_len)
A fast checksum (adler32) function that might be useful.
Definition: bmz.c:1473
bmz_bm_pack_mask(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b)
Definition: bmz.c:1136
Required portability definitions for all .cc files.
#define BMZ_HASH_MOD
Definition: bmz-internal.h:28
static int s_bm_dump
Definition: bmzip.c:65
static void input_from_stdin(size_t offset, size_t fp_len, int action, int options)
Definition: bmzip.c:369
#define BMZ_READ_INT48(_p_, _n_)
Definition: bmzip.c:95
#define WARN(_fmt_,...)
Definition: bmzip.c:74
size_t bmz_pack_worklen(size_t in_len, size_t fp_len)
Return size of work memory for bmz compression.
Definition: bmz.c:1186
#define BMZ_HEADER_SZ
Definition: bmzip.c:49
static int s_bm_hash
Definition: bmzip.c:66