0.9.8.10
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
bmz.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2007-2015 Hypertable, Inc.
3  *
4  * This file is part of Hypertable.
5  *
6  * Hypertable is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; version 3 of the
9  * License, or any later version.
10  *
11  * Hypertable is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Hypertable. If not, see <http://www.gnu.org/licenses/>
18  */
19 
20 /*
21  * An effective/efficient block compressor for input containing long common
22  * strings (e.g. web pages from a website)
23  *
24  * cf. Bentley & McIlroy, "Data Compression Using Long Common Strings", 1999
25  * cf. BMDiff & Zippy mentioned in the Bigtable paper
26  */
27 
28 /* Avoid platform specific stuff in this file */
29 #include <stdio.h>
30 
31 #include <stdlib.h>
32 #include <stdint.h>
33 #include <string.h>
34 #include <math.h>
35 #include <time.h>
36 
37 #include "bmz-internal.h"
38 #include "ThirdParty/lzo/minilzo.h"
39 
40 #pragma GCC diagnostic ignored "-Wpedantic"
41 
/* Initial bases for computing Rabin Karp rolling hash */
#define BM_B1 257
#define BM_B2 277
/* Moduli for the two 16-bit halves of the combined mod-hash */
#define BM_M1 0xffff
#define BM_M2 (0xffff - 4)

/* Bit masks used by the faster mask-based hash family */
#define BM_MASK16 0xffff
#define BM_MASK24 0xffffff
#define BM_MASK32 0xffffffff
#define BM_MASKSZ BM_MASK32

/* Escape character
 * Don't change without understanding BM_DECODE_POS
 */
#define BM_ESC 0xfe
#define BM_MAX_LEN 0xfdffffffffffull /* 253TB block should be big enough */
/* Largest value encodable in 1..5 bytes with a most-significant byte
 * that never collides with BM_ESC
 */
#define BM_MAX_1B 0xfd
#define BM_MAX_2B 0xfdff
#define BM_MAX_3B 0xfdffff
#define BM_MAX_4B 0xfdfffffful
#define BM_MAX_5B 0xfdffffffffull

/* VInt limits: largest value representable in n 7-bit varint groups */
#define BM_MAX_V1B 0x7f
#define BM_MAX_V2B 0x3fff
#define BM_MAX_V3B 0x1fffff
#define BM_MAX_V4B 0xfffffff
#define BM_MAX_V5B 0x7ffffffffull
#define BM_MAX_V6B 0x3ffffffffffull

/* Rolling hash collision thresholds */
#define BM_COLLISION_ABORT_THRESH 1
#define BM_ABORT_COLLISIONS \
  ((BM_COLLISION_ABORT_THRESH + 1) * collision_thresh + 1)

/* Some colors for debugging/dump output */
#define BM_COLOR_DBG "\x1b[31m" /* red */
#define BM_COLOR_ALT "\x1b[32m" /* green */
#define BM_COLOR_END "\x1b[m"

/* May need to do something here, in case stdint.h is not available */
typedef uint8_t Byte;
typedef uint32_t UInt32;
typedef uint64_t UInt64;
typedef int64_t Int64;
typedef int32_t Int32;

/* For printf %llu in case some system has funky headers */
typedef long long unsigned Llu;

/* For lookup table: sentinel meaning "slot empty / not found" */
#define BM_NPOS ((size_t)-1)

/* Aligning memory for work space.
 * NOTE(review): always advances _mem_ by 1.._n_ bytes (even when the
 * pointer is already aligned) and the expansion is not parenthesized as
 * a whole -- callers must not embed it in a larger expression.
 */
#define BM_ALIGN(_mem_, _n_) (Byte *)(_mem_) + _n_ - (((size_t)(_mem_))%(_n_))
97 
/* Some prime numbers as candidates for RK hash bases in case of adaptation
 * (see random_prime(); drawn on when hash collisions force re-seeding)
 */
static size_t s_primes[] = {
  3, 5, 7, 11, 13, 17, 19, 23, 29,
  31, 37, 41, 43, 47, 53, 59, 61, 67, 71,
  73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
  127, 131, 137, 139, 149, 151, 157, 163, 167, 173,
  179, 181, 191, 193, 197, 199, 211, 223, 227, 229,
  233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
  283, 293, 307, 311, 313, 317, 331, 337, 347, 349,
  353, 359, 367, 373, 379, 383, 389, 397, 401, 409,
  419, 421, 431, 433, 439, 443, 449, 457, 461, 463,
  467, 479, 487, 491, 499, 503, 509, 521, 523, 541,
  547, 557, 563, 569, 571, 577, 587, 593, 599, 601,
  607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
  661, 673, 677, 683, 691, 701, 709, 719, 727, 733,
  739, 743, 751, 757, 761, 769, 773, 787, 797, 809,
  811, 821, 823, 827, 829, 839, 853, 857, 859, 863,
  877, 881, 883, 887, 907, 911, 919, 929, 937, 941,
  947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013,
  1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
  1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151,
  1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
  1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291,
  1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373,
  1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451,
  1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
  1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583,
  1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
  1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733,
  1741, 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
  1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889,
  1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
  1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053,
  2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
  2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213,
  2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287,
  2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357,
  2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
  2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531,
  2539, 2543, 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617,
  2621, 2633, 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687,
  2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
  2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, 2803, 2819,
  2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
  2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999,
  3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
  3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181,
  3187, 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257,
  3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331,
  3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
  3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511,
  3517, 3527, 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571,
  3581, 3583, 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643,
  3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
  3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, 3803, 3821,
  3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
  3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989,
  4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
  4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139,
  4153, 4157, 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231,
  4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297,
  4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
  4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493,
  4507, 4513, 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583,
  4591, 4597, 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657,
  4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
  4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, 4817, 4831,
  4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
  4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003,
  5009, 5011, 5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
  5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167, 5171, 5179,
  5189, 5197, 5209, 5227, 5231, 5233, 5237, 5261, 5273, 5279,
  5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, 5381, 5387,
  5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
  5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521,
  5527, 5531, 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639,
  5641, 5647, 5651, 5653, 5657, 5659, 5669, 5683, 5689, 5693,
  5701, 5711, 5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
  5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, 5851, 5857,
  5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939,
  5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053,
  6067, 6073, 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
  6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211, 6217, 6221,
  6229, 6247, 6257, 6263, 6269, 6271, 6277, 6287, 6299, 6301,
  6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, 6361, 6367,
  6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
  6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571,
  6577, 6581, 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673,
  6679, 6689, 6691, 6701, 6703, 6709, 6719, 6733, 6737, 6761,
  6763, 6779, 6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
  6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, 6911, 6917,
  6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997,
  7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103,
  7109, 7121, 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
  7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253, 7283, 7297,
  7307, 7309, 7321, 7331, 7333, 7349, 7351, 7369, 7393, 7411,
  7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, 7489, 7499,
  7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
  7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643,
  7649, 7669, 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723,
  7727, 7741, 7753, 7757, 7759, 7789, 7793, 7817, 7823, 7829,
  7841, 7853, 7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
  7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, 8011, 8017
};
202 
/* Hash collision adaption threshold: collisions tolerated before the
 * encoder re-seeds its hash bases (see BM_HANDLE_COLLISIONS)
 */
static size_t s_collision_thresh = 0;

/* Logging/messaging facilities: 0 = quiet, higher = more verbose */
static int s_verbosity = 0;
208 
/* Default output callback: best-effort write of the buffer to stderr. */
static void
builtin_out(const char *buf, size_t len) {
  size_t written = fwrite(buf, 1, len, stderr);
  (void)written; /* best-effort logging; short writes deliberately ignored */
}
213 
214 static HT_NORETURN void
215 builtin_die(const char *msg) {
216  fputs(msg, stderr);
217  exit(1);
218 }
219 
/* NOTE(review): the definitions of the pluggable s_out_proc/s_die_proc
 * callbacks (presumably defaulting to builtin_out/builtin_die) appear to
 * have been lost here in extraction -- confirm against upstream.
 */

/* Formatted logging gated on s_verbosity.
 * Fix: the original expanded to "if (cond) do {...} while (0)", which is
 * dangling-else-unsafe; the whole macro is now a single do/while statement.
 */
#define BM_LOG(_level_, _fmt_, ...) do { \
  if (s_verbosity >= _level_) { \
    char msg[256]; \
    int len = snprintf(msg, sizeof(msg), "%s: " _fmt_, __FUNCTION__, \
                       ##__VA_ARGS__); \
    s_out_proc(msg, len); \
  } \
} while (0)

/* Raw (unformatted) logging of a byte buffer, same gating and fix */
#define BM_LOG_(_level_, _buf_, _len_) do { \
  if (s_verbosity >= _level_) { \
    s_out_proc((char *)_buf_, _len_); \
  } \
} while (0)

/* Format a fatal message and hand it to the die callback (does not return) */
#define BM_DIE(_fmt_, ...) do { \
  char msg[256]; \
  snprintf(msg, sizeof(msg), "fatal: %s: " _fmt_ "\n", \
           __FUNCTION__, ##__VA_ARGS__); \
  s_die_proc(msg); \
} while (0)

/* Assertion: die with file/line/condition text when _cond_ is false.
 * The do/while form replaces the original "if (_cond_); else ..."
 * expansion, which was dangling-else-unsafe.
 */
#define BM_CHECK(_cond_) do { \
  if (!(_cond_)) \
    BM_DIE("%s:%d: expects: %s", __FILE__, __LINE__, #_cond_); \
} while (0)
244 
245 int
246 bmz_set_verbosity(int verbosity) {
247  int old = s_verbosity;
248  s_verbosity = verbosity;
249  return old;
250 }
251 
254  BmzOutProc old = s_out_proc;
255  s_out_proc = proc;
256  return old;
257 }
258 
259 int
261  int old = s_collision_thresh;
262  s_collision_thresh = thresh;
263  return old;
264 }
265 
266 /* Pick one or two primes out of 1009
267  * used for adaptive hashing in face of bad data
268  */
269 static size_t
270 random_prime(size_t excluded) {
271  const size_t n = sizeof(s_primes) / sizeof(size_t);
272  size_t i = 0;
273  /* Don't really care if rand/srand are not thread-safe for
274  * predictable results, as long as it doesn't crash.
275  */
276  static int sranded = 0;
277 
278  if (!sranded) {
279  srand(time(NULL));
280  sranded = 1;
281  }
282 
283  for (; i < n; ++i) {
284  /* random() in os x and bsd is better, but not in glibc yet. */
285  size_t s = s_primes[rand() % n];
286  if (s != excluded) return s;
287  }
288  /* Safety net, though highly unlikely (1/1009^1009) to reach here */
289  return 8111;
290 }
291 
/* Size the lookup table: the smallest power of two that is also at least
 * 8n/5, keeping head room so open-addressing probes stay short.
 */
static size_t
bm_lut_size(size_t n) {
  uint64_t v = n; /* size_t could be 64-bit */
  int shift;

  do {
    /* round up to a power of 2, so "mod" can be a mask */
    for (shift = 1; shift <= 32; shift <<= 1)
      v |= v >> shift;
    ++v;
  } while (v < n * 8 / 5); /* avoid too much probing */
  return (size_t)v;
}
308 
/* Open-addressing hash table entry: rolling hash value and the position
 * in the input where the fingerprint starts (BM_NPOS = empty slot).
 */
typedef struct {
  size_t hash;
  size_t pos;
} BmLutEntry;

/* Fingerprint lookup table with instrumentation counters */
typedef struct {
  size_t size, collisions, adapts, probes;
  BmLutEntry *entries; /* restored: used by init_bmlut() and BM_LOOKUP;
                        * the member line was lost in extraction */
} BmLut;
318 
319 static void
320 init_bmlut(BmLut *table, size_t n, void *work_mem) {
321  BmLutEntry *p, *endp;
322 
323  table->probes = table->adapts = table->collisions = 0;
324  table->size = bm_lut_size(n);
325  table->entries = (BmLutEntry *) work_mem;
326 
327  for (p = table->entries, endp = p + table->size; p < endp; ++p)
328  p->pos = BM_NPOS;
329 
330  BM_LOG(1, "n(fps): %llu, size: %llu\n", (Llu)n, (Llu)table->size);
331 }
332 
/* Simple hash lookup with linear probing: start at hash & (size-1) (size
 * is a power of two) and scan until an empty slot (pos == BM_NPOS) or a
 * slot whose stored hash matches; _entry_ receives the slot pointer.
 */
#define BM_LOOKUP(_lut_, _hash_, _entry_) do { \
  size_t sz = _lut_.size, idx = (_hash_) & (sz - 1), probes = 0; \
  BmLutEntry *b = _lut_.entries, *p = b + idx, *endp = b + sz; \
  \
  while (p->pos != BM_NPOS) { \
    if (p->hash == _hash_) break; \
    ++p; \
    if (p >= endp) p = b; /* wrap around */ \
    ++probes; \
  } \
  if (probes) _lut_.probes += probes; \
  _entry_ = p; \
} while (0)
347 
348 /* Classical/generic Rabin Karp */
349 #define R_HASH_MOD_BODY(_int_type_) \
350  _int_type_ h = 0, p = 1; \
351  \
352  while (len--) { \
353  h += p * buf[len]; \
354  h %= m; \
355  p *= b; \
356  p %= m; \
357  } \
358  return h
359 
360 static UInt64
361 r_hash_mod(const Byte *buf, size_t len, size_t b, UInt64 m) {
363 }
364 
365 static UInt32
366 r_hash_mod32(const Byte *buf, size_t len, UInt32 b, UInt32 m) {
368 }
369 
370 /* D* rolling hash family (Andrew Trigdell's thesis p72-73) */
371 static UInt32
372 r_hash_mod16x2(const Byte *buf, size_t len, UInt32 b1, UInt32 b2,
373  UInt32 m1, UInt32 m2) {
374  return (r_hash_mod32(buf, len, b1, m1) << 16) |
375  r_hash_mod32(buf, len, b2, m2);
376 }
377 
378 /* Compute (x^n) % m */
379 #define POW_MOD_BODY(_pow_fun_, _int_type_) \
380  if (n == 0) return 1; \
381  if (n == 1) return x % m; \
382  { \
383  _int_type_ sqr = (x * x) % m; \
384  _int_type_ pow = _pow_fun_(sqr, n / 2, m); \
385  if (n & 1) return (pow * x) % m; \
386  return pow % m; \
387  }
388 
389 static UInt64
392 }
393 
394 static UInt32
397 }
398 
399 /* update rolling hash */
400 #define UPDATE_HASH_MOD_BODY \
401  h *= b; \
402  h -= (out * pow_n) % m; \
403  h += in; \
404  if (h < 0) h += m; \
405  return (h % m)
406 
407 static inline Int64
408 update_hash_mod(Int64 h, Byte in, Byte out, Int64 pow_n, Int64 b, Int64 m) {
410 }
411 
412 static inline Int32
413 update_hash_mod32(Int32 h, Byte in, Byte out, Int32 pow_n, Int32 b, Int32 m) {
415 }
416 
417 static inline UInt32
418 update_hash_mod16x2(UInt32 h, int in, int out, UInt32 pow1, UInt32 pow2,
419  UInt32 b1, UInt32 b2, UInt32 m1, UInt32 m2) {
420  return (update_hash_mod32((h >> 16), in, out, pow1, b1, m1) << 16) |
421  update_hash_mod32((h & BM_MASK16), in, out, pow2, b2, m2);
422 }
423 
424 /* Faster hash using mask instead of mod
425  * m needs to be power-of-2
426  */
427 #define R_HASH_MASK_BODY(int_type) \
428  int_type h = 0, p = 1; \
429  \
430  while (len--) { \
431  h += p * buf[len]; \
432  h &= m; \
433  p *= b; \
434  p &= m; \
435  } \
436  return h
437 
438 static UInt64
439 r_hash_mask(const Byte *buf, size_t len, UInt64 b, UInt64 m) {
441 }
442 
443 static UInt32
444 r_hash_mask32(const Byte *buf, size_t len, UInt32 b, UInt32 m) {
446 }
447 
448 /* C* rolling hash family (see D* above) */
449 static UInt32
450 r_hash_mask16x2(const Byte *buf, size_t len, UInt32 b1, UInt32 b2) {
451  return (r_hash_mask32(buf, len, b1, BM_MASK16) << 16) |
452  r_hash_mask32(buf, len, b2, BM_MASK16);
453 }
454 
455 static UInt64
456 r_hash_mask32x2(const Byte *buf, size_t len, UInt64 b1, UInt64 b2) {
457  return (r_hash_mask(buf, len, b1, BM_MASK32) << 32) |
458  r_hash_mask(buf, len, b2, BM_MASK32);
459 }
460 
461 #define POW_MASK_BODY(_pow_fun_, _int_type_) \
462  if (n == 0) return 1; \
463  if (n == 1) return (x & m); \
464  { \
465  _int_type_ sqr = (x * x) & m; \
466  _int_type_ pow = _pow_fun_(sqr, n / 2, m); \
467  if (n & 1) return ((pow * x) & m); \
468  return (pow & m); \
469  }
470 
471 static UInt64
474 }
475 
476 static UInt32
479 }
480 
481 #define UPDATE_HASH_MASK_BODY \
482  h *= b; \
483  h -= (out * pow_n) & m; \
484  h += in; \
485  return (h & m)
486 
487 static inline UInt64
488 update_hash_mask(UInt64 h, int in, int out, UInt64 pow_n, UInt64 b, UInt64 m) {
490 }
491 
492 static inline UInt32
493 update_hash_mask32(UInt32 h, int in, int out, UInt32 pow_n,
494  UInt32 b, UInt32 m) {
496 }
497 
498 static inline UInt32
499 update_hash_mask16x2(UInt32 h, int in, int out, UInt32 pow1, UInt32 pow2,
500  UInt32 b1, UInt32 b2) {
501  return (update_hash_mask32((h >> 16), in, out, pow1, b1, BM_MASK16) << 16) |
502  update_hash_mask32((h & BM_MASK16), in, out, pow2, b2, BM_MASK16);
503 }
504 
505 static inline UInt64
506 update_hash_mask32x2(UInt64 h, int in, int out, UInt64 pow1, UInt64 pow2,
507  UInt64 b1, UInt64 b2) {
508  return (update_hash_mask((h >> 32), in, out, pow1, b1, BM_MASK32) << 32) |
509  update_hash_mask((h & BM_MASK32), in, out, pow2, b2, BM_MASK32);
510 }
511 
512 /* External AP Ifor computing the hashes */
513 size_t
514 bmz_hash_mod(const void *in, size_t in_len, size_t b, size_t m) {
515  return r_hash_mod((Byte *)in, in_len, b, m);
516 }
517 
518 size_t
519 bmz_hash_mod16x2(const void *in, size_t in_len, size_t b1, size_t b2,
520  size_t m1, size_t m2) {
521  return r_hash_mod16x2((Byte *)in, in_len, b1, b2, m1, m2);
522 }
523 
524 size_t
525 bmz_hash_mask16x2(const void *in, size_t in_len, size_t b1, size_t b2) {
526  return r_hash_mask16x2((Byte *)in, in_len, b1, b2);
527 }
528 
529 size_t
530 bmz_hash_mask(const void *in, size_t in_len, size_t b) {
531  return r_hash_mask((Byte *)in, in_len, b, BM_MASKSZ);
532 }
533 
534 size_t
535 bmz_hash_mask32x2(const void *in, size_t in_len, size_t b1, size_t b2) {
536  return r_hash_mask32x2((Byte *)in, in_len, b1, b2);
537 }
538 
/* Verify the rolling hashes: compare the incrementally updated hash
 * (_update_hash_) against a full re-hash (_rehash_) at every window
 * position; returns BMZ_E_OK only if they always agree.
 */
#define BM_HASH_CHECK_BODY(_init_hash_, _rehash_, _update_hash_) \
  size_t hash; \
  Byte *in = (Byte *)src, *ip = in, *in_end = in + in_len; \
  int errs = 0; \
  \
  BM_CHECK(fp_len < in_len); \
  _init_hash_; \
  \
  for (; ip < in_end - fp_len - 1; ++ip) { \
    size_t h0 = _rehash_; \
    hash = _update_hash_; \
    if (h0 != hash) { \
      BM_LOG(0, "mismatch at pos %ld: %lx vs %lx\n", \
             (long)(ip - in), (long)h0, (long)hash); \
      ++errs; \
    } \
  } \
  BM_LOG(1, "total errors: %d\n", errs); \
  return errs ? BMZ_E_ERROR : BMZ_E_OK

/* Time only the incremental hash update over the whole input */
#define BM_HASH_BENCH_BODY(_init_hash_, _update_hash_) \
  size_t hash; \
  Byte *in = (Byte *)src, *ip = in, *in_end = in + in_len; \
  \
  BM_CHECK(fp_len < in_len); \
  _init_hash_; \
  \
  for (; ip < in_end - fp_len - 1; ++ip) { \
    hash = _update_hash_; \
  } \
  BM_LOG(1, "last hash: %lx\n", (long)hash) /* or the loop gets optimized out */
571 
/* Verify incremental mod-hash updates against full re-hashing at every
 * position; returns BMZ_E_OK or BMZ_E_ERROR.
 */
int
bmz_check_hash_mod(const void *src, size_t in_len, size_t fp_len,
                   size_t b, size_t m) {
  size_t pow_n;

  BM_HASH_CHECK_BODY(hash = r_hash_mod(in, fp_len, b, m);
                     pow_n = pow_mod(b, fp_len, m),
                     r_hash_mod(ip + 1, fp_len, b, m),
                     update_hash_mod(hash, ip[fp_len], *ip, pow_n, b, m));
}

/* Benchmark incremental mod-hash updates over the whole input */
void
bmz_bench_hash_mod(const void *src, size_t in_len, size_t fp_len) {
  UInt64 b = BM_B1, m = BM_MASKSZ;
  size_t pow_n;

  BM_HASH_BENCH_BODY(hash = r_hash_mod(in, fp_len, b, m);
                     pow_n = pow_mod(b, fp_len, m),
                     update_hash_mod(hash, ip[fp_len], *ip, pow_n, b, m));
}
592 
/* Verify incremental updates of the 16x2 mod-hash against re-hashing */
int
bmz_check_hash_mod16x2(const void *src, size_t in_len, size_t fp_len,
                       size_t b1, size_t b2, size_t m1, size_t m2) {
  size_t pow_n_b1, pow_n_b2;

  BM_HASH_CHECK_BODY(hash = r_hash_mod16x2(in, fp_len, b1, b2, m1, m2);
                     pow_n_b1 = pow_mod32(b1, fp_len, m1);
                     pow_n_b2 = pow_mod32(b2, fp_len, m2),
                     r_hash_mod16x2(ip + 1, fp_len, b1, b2, m1, m2),
                     update_hash_mod16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                         pow_n_b2, b1, b2, m1, m2));
}

/* Benchmark incremental updates of the 16x2 mod-hash */
void
bmz_bench_hash_mod16x2(const void *src, size_t in_len, size_t fp_len) {
  size_t pow_n_b1, pow_n_b2, b1 = BM_B1, b2 = BM_B2, m1 = BM_M1, m2 = BM_M2;

  BM_HASH_BENCH_BODY(hash = r_hash_mod16x2(in, fp_len, b1, b2, m1, m2);
                     pow_n_b1 = pow_mod32(b1, fp_len, m1);
                     pow_n_b2 = pow_mod32(b2, fp_len, m2),
                     update_hash_mod16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                         pow_n_b2, b1, b2, m1, m2));
}
616 
/* Verify incremental updates of the 16x2 mask-hash against re-hashing */
int
bmz_check_hash_mask16x2(const void *src, size_t in_len, size_t fp_len,
                        size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;

  BM_HASH_CHECK_BODY(hash = r_hash_mask16x2(in, fp_len, b1, b2);
                     pow_n_b1 = pow_mask32(b1, fp_len, BM_MASK16);
                     pow_n_b2 = pow_mask32(b2, fp_len, BM_MASK16),
                     r_hash_mask16x2(ip + 1, fp_len, b1, b2),
                     update_hash_mask16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                          pow_n_b2, b1, b2));
}

/* Benchmark incremental updates of the 16x2 mask-hash */
void
bmz_bench_hash_mask16x2(const void *src, size_t in_len, size_t fp_len) {
  size_t pow_n_b1, pow_n_b2, b1 = BM_B1, b2 = BM_B2;

  BM_HASH_BENCH_BODY(hash = r_hash_mask16x2(in, fp_len, b1, b2);
                     pow_n_b1 = pow_mask32(b1, fp_len, BM_MASK16);
                     pow_n_b2 = pow_mask32(b2, fp_len, BM_MASK16),
                     update_hash_mask16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                          pow_n_b2, b1, b2));
}
640 
/* Verify incremental updates of the plain mask-hash against re-hashing */
int
bmz_check_hash_mask(const void *src, size_t in_len, size_t fp_len,
                    size_t b) {
  size_t pow_n, m = BM_MASKSZ;

  BM_HASH_CHECK_BODY(hash = r_hash_mask(in, fp_len, b, m);
                     pow_n = pow_mask(b, fp_len, m),
                     r_hash_mask(ip + 1, fp_len, b, m),
                     update_hash_mask(hash, ip[fp_len], *ip, pow_n, b, m));
}
651 
652 void
653 bmz_bench_hash_mask(const void *src, size_t in_len, size_t fp_len) {
654  size_t pow_n, b = BM_B1, m = BM_MASKSZ;
655 
656  BM_HASH_BENCH_BODY(hash = r_hash_mask(in, fp_len, b, m);
657  pow_n = pow_mask(b, fp_len, m),
658  update_hash_mask(hash, *ip, ip[fp_len], pow_n, b, m));
659 }
660 
/* Verify incremental updates of the 32x2 mask-hash against re-hashing */
int
bmz_check_hash_mask32x2(const void *src, size_t in_len, size_t fp_len,
                        size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;

  BM_HASH_CHECK_BODY(hash = r_hash_mask32x2(in, fp_len, b1, b2);
                     pow_n_b1 = pow_mask(b1, fp_len, BM_MASK32);
                     pow_n_b2 = pow_mask(b2, fp_len, BM_MASK32),
                     r_hash_mask32x2(ip + 1, fp_len, b1, b2),
                     update_hash_mask32x2(hash, ip[fp_len], *ip, pow_n_b1,
                                          pow_n_b2, b1, b2));
}

/* Benchmark incremental updates of the 32x2 mask-hash */
void
bmz_bench_hash_mask32x2(const void *src, size_t in_len, size_t fp_len) {
  size_t pow_n_b1, pow_n_b2, b1 = BM_B1, b2 = BM_B2;

  BM_HASH_BENCH_BODY(hash = r_hash_mask32x2(in, fp_len, b1, b2);
                     pow_n_b1 = pow_mask(b1, fp_len, BM_MASK32);
                     pow_n_b2 = pow_mask(b2, fp_len, BM_MASK32),
                     update_hash_mask32x2(hash, ip[fp_len], *ip, pow_n_b1,
                                          pow_n_b2, b1, b2));
}
684 
685 void
686 bmz_bench_hash(const void *in, size_t in_len, unsigned type) {
687  size_t fp_len = 100;
688 
689  switch (type) {
690  case BMZ_HASH_MOD:
691  bmz_bench_hash_mod(in, in_len, fp_len);
692  break;
693  case BMZ_HASH_MOD16X2:
694  bmz_bench_hash_mod16x2(in, in_len, fp_len);
695  break;
696  case BMZ_HASH_MASK16X2:
697  bmz_bench_hash_mask16x2(in, in_len, fp_len);
698  break;
699  case BMZ_HASH_MASK:
700  bmz_bench_hash_mask(in, in_len, fp_len);
701  break;
702  case BMZ_HASH_MASK32X2:
703  bmz_bench_hash_mask32x2(in, in_len, fp_len);
704  }
705 }
706 
/* Benchmark lookup table performance: walk the input, probing the LUT at
 * every position and inserting a fingerprint every fp_len bytes; a
 * collision is counted when hashes match but the bytes differ, and the
 * summary reports the hash's effective number of bits.
 */
#define BM_LUT_BENCH_BODY(_init_hash_, _update_hash_) \
  size_t hash; \
  Byte *in = (Byte *)src, *ip = in, *in_end = in + in_len; \
  size_t scan_len = in_len - fp_len, c, boundary; \
  BmLut lut; \
  BmLutEntry *entry; \
  \
  BM_CHECK(fp_len < in_len); \
  init_bmlut(&lut, in_len / fp_len, work_mem); \
  _init_hash_; \
  \
  for (boundary = fp_len; ip < in_end - fp_len - 1; ++ip) { \
    size_t pos = ip - in; \
    BM_LOOKUP(lut, hash, entry); \
    \
    if (entry->pos != BM_NPOS) { \
      if (memcmp(in + entry->pos, ip, fp_len) != 0) ++lut.collisions; \
    } \
    if (pos == boundary) { \
      entry->hash = hash; \
      entry->pos = pos; \
      boundary += fp_len; \
    } \
    hash = _update_hash_; \
  } \
  c = lut.collisions; \
  BM_LOG(1, "length: %llu, lut.size: %llu, lut.collisions: %llu (eff. bits: " \
         "%.3f), lut.probes: %llu (%.3f/lu)\n", (Llu)in_len, (Llu)lut.size, \
         (Llu)c, c ? log((double)in_len / fp_len * scan_len / c) / log(2) \
         : sizeof(size_t) * 8., \
         (Llu)lut.probes, (double)lut.probes / scan_len)
739 
/* LUT benchmark with the classical mod-hash */
void
bmz_bench_lut_mod(const void *src, size_t in_len, size_t fp_len,
                  void *work_mem, size_t b, size_t m) {
  size_t pow_n;

  BM_LUT_BENCH_BODY(hash = r_hash_mod(in, fp_len, b, m);
                    pow_n = pow_mod(b, fp_len, m),
                    update_hash_mod(hash, ip[fp_len], *ip, pow_n, b, m));
}

/* LUT benchmark with the 16x2 mod-hash */
void
bmz_bench_lut_mod16x2(const void *src, size_t in_len, size_t fp_len,
                      void *work_mem, size_t b1, size_t b2,
                      size_t m1, size_t m2) {
  size_t pow_n_b1, pow_n_b2;

  BM_LUT_BENCH_BODY(hash = r_hash_mod16x2(in, fp_len, b1, b2, m1, m2);
                    pow_n_b1 = pow_mod32(b1, fp_len, m1);
                    pow_n_b2 = pow_mod32(b2, fp_len, m2),
                    update_hash_mod16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                        pow_n_b2, b1, b2, m1, m2));
}

/* LUT benchmark with the 16x2 mask-hash */
void
bmz_bench_lut_mask16x2(const void *src, size_t in_len, size_t fp_len,
                       void *work_mem, size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;

  BM_LUT_BENCH_BODY(hash = r_hash_mask16x2(in, fp_len, b1, b2);
                    pow_n_b1 = pow_mask32(b1, fp_len, BM_MASK16);
                    pow_n_b2 = pow_mask32(b2, fp_len, BM_MASK16),
                    update_hash_mask16x2(hash, ip[fp_len], *ip, pow_n_b1,
                                         pow_n_b2, b1, b2));
}
774 
775 void
776 bmz_bench_lut_mask(const void *src, size_t in_len, size_t fp_len,
777  void *work_mem, size_t b) {
778  size_t pow_n, m = BM_MASKSZ;
779 
780  BM_LUT_BENCH_BODY(hash = r_hash_mask(in, fp_len, b, m);
781  pow_n = pow_mask(b, fp_len, m),
782  update_hash_mask(hash, *ip, ip[fp_len], pow_n, b, m));
783 }
784 
/* LUT benchmark with the 32x2 mask-hash */
void
bmz_bench_lut_mask32x2(const void *src, size_t in_len, size_t fp_len,
                       void *work_mem, size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;

  BM_LUT_BENCH_BODY(hash = r_hash_mask32x2(in, fp_len, b1, b2);
                    pow_n_b1 = pow_mask(b1, fp_len, BM_MASK32);
                    pow_n_b2 = pow_mask(b2, fp_len, BM_MASK32),
                    update_hash_mask32x2(hash, ip[fp_len], *ip, pow_n_b1,
                                         pow_n_b2, b1, b2));
}
796 
/* One way to debug macro laden code:
 * gcc -E % | sed '/^\#/d' | gnuindent -st -i2 -kr > bmzpp.c
 */

/* Bounds checks: jump to the function's error labels on over-read/write */
#define BM_NEED_IN(_n_) \
  if ((int)(in_end - ip) < (int)(_n_)) goto input_overrun

#define BM_NEED_OUT(_n_) \
  if ((int)(out_end - op) < (int)(_n_)) goto output_overrun

/* Copy literal bytes to output, doubling any BM_ESC byte to escape it */
#define BM_ENCODE_OUTPUT(_op_, _ip_, _ip_end_) \
  while (_ip_ < _ip_end_) { \
    if (*_ip_ == BM_ESC) { \
      BM_NEED_OUT(1); \
      *_op_++ = BM_ESC; \
    } \
    BM_NEED_OUT(1); \
    *_op_++ = *_ip_++; \
  }

/* Less comparison and earlier termination for uncompressible stuff */
#define BM_ENCODE_FINAL_OUTPUT(_op_, _ip_, _ip_end_) \
  BM_NEED_OUT((_ip_end_) - (_ip_)); /* at least */ \
  while (_ip_ < _ip_end_) { \
    if (*_ip_ == BM_ESC) { \
      BM_NEED_OUT((_ip_end_) - (_ip_) + 1); \
      *_op_++ = BM_ESC; \
    } \
    *_op_++ = *_ip_++; \
  }

/* Fall back to a stored (uncompressed) block: a leading 0 marker byte
 * followed by the input verbatim
 */
#define BM_ENCODE_PASS do { \
  *out = 0; \
  memcpy(out + 1, in, in_len); \
  *out_len_p = in_len + 1; \
  BM_ENCODE_STAT("no reduction"); \
  return BMZ_E_OK; \
} while (0)
835 
/* Emit _n_ as a big-endian integer of exactly _width_ bytes (1..6);
 * _width_ must come from BM_INT_WIDTH so the MSB never equals BM_ESC
 */
#define BM_ENCODE_INT(_op_, _n_, _width_) do { \
  UInt64 _n = _n_; \
  switch (_width_) { \
  case 1: \
    *_op_++ = (Byte) _n; \
    break; \
  case 2: \
    *_op_++ = (Byte)(_n >> 8); \
    *_op_++ = (Byte)(_n); \
    break; \
  case 3: \
    *_op_++ = (Byte)(_n >> 16); \
    *_op_++ = (Byte)(_n >> 8); \
    *_op_++ = (Byte)(_n); \
    break; \
  case 4: \
    *_op_++ = (Byte)(_n >> 24); \
    *_op_++ = (Byte)(_n >> 16); \
    *_op_++ = (Byte)(_n >> 8); \
    *_op_++ = (Byte)(_n); \
    break; \
  case 5: \
    *_op_++ = (Byte)(_n >> 32); \
    *_op_++ = (Byte)(_n >> 24); \
    *_op_++ = (Byte)(_n >> 16); \
    *_op_++ = (Byte)(_n >> 8); \
    *_op_++ = (Byte)(_n); \
    break; \
  case 6: \
    *_op_++ = (Byte)(_n >> 40); \
    *_op_++ = (Byte)(_n >> 32); \
    *_op_++ = (Byte)(_n >> 24); \
    *_op_++ = (Byte)(_n >> 16); \
    *_op_++ = (Byte)(_n >> 8); \
    *_op_++ = (Byte)(_n); \
    break; \
  default: \
    BM_DIE("%s", "256TB ought to be enough for anyone!"); \
  } \
} while (0)
876 
/* Emit _n_ as a little-endian base-128 varint of exactly _width_ groups
 * (1..7); the high bit of each byte marks "more bytes follow"
 */
#define BM_ENCODE_VINT(_op_, _n_, _width_) do { \
  UInt64 _n = _n_; \
  switch (_width_) { \
  case 1: \
    *_op_++ = (Byte)((_n) & 0x7f); \
    break; \
  case 2: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) & 0x7f); \
    break; \
  case 3: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) | 0x80); \
    *_op_++ = (Byte)((_n >> 14) & 0x7f); \
    break; \
  case 4: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) | 0x80); \
    *_op_++ = (Byte)((_n >> 14) | 0x80); \
    *_op_++ = (Byte)((_n >> 21) & 0x7f); \
    break; \
  case 5: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) | 0x80); \
    *_op_++ = (Byte)((_n >> 14) | 0x80); \
    *_op_++ = (Byte)((_n >> 21) | 0x80); \
    *_op_++ = (Byte)((_n >> 28) & 0x7f); \
    break; \
  case 6: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) | 0x80); \
    *_op_++ = (Byte)((_n >> 14) | 0x80); \
    *_op_++ = (Byte)((_n >> 21) | 0x80); \
    *_op_++ = (Byte)((_n >> 28) | 0x80); \
    *_op_++ = (Byte)((_n >> 35) & 0x7f); \
    break; \
  case 7: \
    *_op_++ = (Byte)(_n | 0x80); \
    *_op_++ = (Byte)((_n >> 7) | 0x80); \
    *_op_++ = (Byte)((_n >> 14) | 0x80); \
    *_op_++ = (Byte)((_n >> 21) | 0x80); \
    *_op_++ = (Byte)((_n >> 28) | 0x80); \
    *_op_++ = (Byte)((_n >> 35) | 0x80); \
    *_op_++ = (Byte)((_n >> 42) & 0x7f); \
    break; \
  default: \
    BM_DIE("%s", "512TB ought to be enough for anyone!"); \
  } \
} while (0)
926 
/* Decode a varint produced by BM_ENCODE_VINT into _n_ (up to 7 groups);
 * the bare `break`s exit the enclosing do/while(0), not a loop
 */
#define BM_DECODE_VINT(_n_) do { \
  BM_NEED_IN(1); \
  _n_ = (*ip++ & 0x7f); \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((*ip++ & 0x7f) << 7); \
  } else break; \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((*ip++ & 0x7f) << 14); \
  } else break; \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((*ip++ & 0x7f) << 21); \
  } else break; \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((UInt64)(*ip++ & 0x7f) << 28); \
  } else break; \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((UInt64)(*ip++ & 0x7f) << 35); \
  } else break; \
  if (ip[-1] & 0x80) { \
    BM_NEED_IN(1); \
    _n_ |= ((UInt64)(*ip++ & 0x7f) << 42); \
  } \
} while (0)
955 
/* Byte width needed by BM_ENCODE_INT for _n_ (MSB capped below BM_ESC) */
#define BM_INT_WIDTH(_n_) \
  (_n_ > BM_MAX_5B ? 6 : \
   (_n_ > BM_MAX_4B ? 5 : \
    (_n_ > BM_MAX_3B ? 4 : \
     (_n_ > BM_MAX_2B ? 3 : \
      (_n_ > BM_MAX_1B ? 2 : 1)))))

/* Varint group count needed by BM_ENCODE_VINT for _n_ */
#define BM_VINT_WIDTH(_n_) \
  (_n_ > BM_MAX_V6B ? 7 : \
   (_n_ > BM_MAX_V5B ? 6 : \
    (_n_ > BM_MAX_V4B ? 5 : \
     (_n_ > BM_MAX_V3B ? 4 : \
      (_n_ > BM_MAX_V2B ? 3 : \
       (_n_ > BM_MAX_V1B ? 2 : 1))))))

/* Emit an ESC-marked <pos, len> copy directive.  The width of pos is
 * derived from the current input position _ipos_ (an upper bound for
 * _pos_), so the decoder can recompute the same width.
 */
#define BM_ENCODE_DELTA(_op_, _ipos_, _pos_, _len_) do { \
  UInt64 _ipos = _ipos_, _len = _len_; \
  int pos_w = BM_INT_WIDTH(_ipos); \
  int len_w = BM_VINT_WIDTH(_len); \
  BM_NEED_OUT(pos_w + len_w + 1); \
  *_op_++ = BM_ESC; \
  BM_ENCODE_INT(_op_, _pos_, pos_w); \
  BM_ENCODE_VINT(_op_, _len_, len_w); \
} while (0)
980 
/* Executed when a hash bucket's stored fingerprint fails the memcmp check
 * (a true hash collision).  After `collision_thresh` collisions the whole
 * encoding restarts from `offset` with a freshly randomized hash
 * (_randomize_/_init_hash_ supplied by the expansion site); after
 * BM_COLLISION_ABORT_THRESH such restarts we give up and fall through to
 * the output_overrun / pass-through path.  Mutates the encoder loop state
 * (i, last_i, boundary, op, last_ip, lut) in place and `continue`s.
 * NOTE(review): the give-up message logs BM_ABORT_COLLISIONS (defined
 * elsewhere in the file) rather than the observed count — confirm that is
 * intended.
 */
#define BM_HANDLE_COLLISIONS(_randomize_, _init_hash_) do { \
  size_t adapts = lut.adapts, collisions = ++lut.collisions; \
  \
  if (adapts > BM_COLLISION_ABORT_THRESH) { \
    /* stumped & give up, just pass the bytes */ \
    BM_LOG(1, "too much collisions: %llu, giving up.\n", \
           (Llu)BM_ABORT_COLLISIONS); \
    goto output_overrun; \
  } \
  else if (collisions > collision_thresh) { \
    /* unlikely when the hash is good, so reinitialize hash, in case \
     * data is wacked/adversarial for some reasons. \
     */ \
    BM_LOG(2, "hash collision %llu: %llx: pos: %llu and %llu: [" BM_COLOR_DBG, \
           (Llu)lut.collisions, (Llu)hash, (Llu)pos, (Llu)i); \
    BM_LOG_(2, in + pos, fp_len); \
    BM_LOG(2, "%s", BM_COLOR_END "] vs [" BM_COLOR_ALT); \
    BM_LOG_(2, in + i, fp_len); \
    BM_LOG(2, "%s", BM_COLOR_END "] trying to adapt.\n"); \
    _randomize_; \
    boundary = last_i = i = offset; \
    _init_hash_; \
    init_bmlut(&lut, in_len / fp_len, work_mem); \
    lut.adapts = adapts + 1; \
    op = out + 1; \
    last_ip = in; \
    continue; \
  } \
} while (0)
1010 
/* Log a one-line summary of an encoding run: input/output sizes, collision
 * count (with the effective hash quality in bits), the collision threshold
 * and lookup-table probe statistics.  Relies on in_len, out_len_p, lut,
 * nfps, scan_len and collision_thresh at the expansion site.
 * NOTE(review): the format string has an unmatched ')' after "thresh:
 * %llu" — cosmetic only, left untouched here. */
#define BM_ENCODE_STAT(_msg_) do { \
  size_t c = lut.collisions; \
  BM_LOG(1, "%s: in: %llu, out: %llu, lut.collisions: %llu (eff. bits: %.3f) " \
         "thresh: %llu), lut.probes: %llu (%.3f/lu)\n", _msg_, (Llu)in_len, \
         (Llu)*out_len_p, (Llu)c, \
         c ? log((double)nfps * scan_len / c) / log(2) : sizeof(size_t) * 8., \
         (Llu)collision_thresh, (Llu)lut.probes, \
         (double)lut.probes / scan_len); \
} while (0)
1020 
/* The main algorithm (Bentley-McIlroy long-common-strings encoder).
 *
 * Scans the input with a rolling hash over a window of fp_len bytes,
 * storing one fingerprint per fp_len-aligned boundary in `lut`.  When the
 * hash of the current window hits a stored fingerprint and memcmp
 * confirms the match, pending literals are flushed, the match is extended
 * greedily backward (up to fp_len - 1 bytes) and forward, and a
 * <pos,len> pointer is emitted via BM_ENCODE_DELTA.  A memcmp mismatch is
 * a hash collision and goes through BM_HANDLE_COLLISIONS, which may
 * restart the whole scan with a re-randomized hash.
 *
 * Macro parameters:
 *   _init_hash_   - statement(s) computing `hash` for the window at i
 *   _update_hash_ - expression rolling `hash` from position i to i + 1
 *   _randomize_   - statement(s) re-seeding the hash bases on collision
 *
 * Expands into a complete function body: expects src, in_len, dst,
 * out_len_p, offset, fp_len and work_mem in scope, and returns a BMZ_E_*
 * status. */
#define BM_ENCODE_BODY(_init_hash_, _update_hash_, _randomize_) \
  size_t i = offset, last_i = i, end_i = in_len - fp_len; \
  size_t boundary = i, scan_len = end_i - offset; \
  size_t nfps, hash, pos, collision_thresh; \
  Byte *in = (Byte *) src, *out = (Byte *) dst; \
  Byte *ip = in, *last_ip = ip, *in_end = in + in_len; \
  Byte *op = out, *out_end; \
  BmLut lut = { 0, 0, 0, 0, NULL }; \
  BmLutEntry *entry = NULL; \
  \
  BM_CHECK(fp_len > 0); \
  nfps = in_len / fp_len; \
  collision_thresh = s_collision_thresh ? s_collision_thresh \
                                        : scan_len / 1e9 * nfps + 8; \
  if (in_len <= offset + fp_len) BM_ENCODE_PASS; \
  \
  init_bmlut(&lut, nfps, work_mem); \
  _init_hash_; \
  \
  *op++ = 1; /* default */ \
  out_end = op + (*out_len_p < in_len ? *out_len_p : in_len); \
  \
  for (; i <= end_i; ++i) { \
    if (i > last_i) { \
      BM_LOOKUP(lut, hash, entry); \
      pos = entry->pos; \
      \
      if (pos != BM_NPOS) { \
        Byte *ip0 = in + pos, *cp = in + i, *cp0 = cp; \
        Byte *ip1 = ip0 + fp_len, *cp1 = cp0 + fp_len; \
        \
        if (memcmp(ip0, cp0, fp_len) == 0) { \
          /* we have a match */ \
          size_t mlen = 0; \
          /* greedy backward up to fp_len - 1 */ \
          for (ip = ip0; ip > in && cp > last_ip && \
               mlen < fp_len - 1 && *--ip == *--cp; ++mlen); \
          cp = cp0 - mlen; \
          BM_ENCODE_OUTPUT(op, last_ip, cp); \
          pos = ip0 - mlen - in; \
          mlen += fp_len; \
          /* greedy forward as much as possible */ \
          for (ip = ip1, cp = cp1; cp < in_end && *ip++ == *cp++; ++mlen); \
          BM_ENCODE_DELTA(op, last_ip - in, pos, mlen); \
          last_ip = cp0 - (ip0 - (in + pos)) + mlen; \
          if (last_ip == in_end) break; \
          last_i = last_ip - in; \
        } \
        else BM_HANDLE_COLLISIONS(_randomize_, _init_hash_); \
      } \
    } \
    if ((i - offset) == boundary) { \
      if (i <= last_i) BM_LOOKUP(lut, hash, entry); \
      /* insert hash */ \
      entry->hash = hash; \
      entry->pos = i; \
      boundary += fp_len; \
    } \
    if (i < end_i) hash = _update_hash_; /* for next iter */ \
  } \
  /* copy the remaining bytes */ \
  BM_ENCODE_FINAL_OUTPUT(op, last_ip, in_end); \
  *out_len_p = op - out; \
  BM_ENCODE_STAT("success"); \
  \
  return BMZ_E_OK; \
  \
output_overrun: \
  if (*out_len_p > in_len) BM_ENCODE_PASS; \
  BM_ENCODE_STAT("tip: make output buffer at least input length + 1"); \
  return BMZ_E_OUTPUT_OVERRUN
1093 
/* External APIs for bm encoding */

/* BM-encode `src` into `dst` using a single Rabin-Karp rolling hash with
 * base b and modulus m.  Scanning starts at `offset`; fp_len is the
 * fingerprint (minimum match) length; work_mem must hold
 * bmz_bm_pack_worklen(in_len, fp_len) bytes.  *out_len_p holds the
 * capacity of dst on entry and the encoded size on success. */
int
bmz_bm_pack_mod(const void *src, size_t in_len, void *dst, size_t *out_len_p,
                size_t offset, size_t fp_len, void *work_mem,
                size_t b, size_t m) {
  size_t pow_n;
  /* macro args: init hash; rolling update; re-randomize base on collision */
  BM_ENCODE_BODY(hash = r_hash_mod(in + i, fp_len, b, m);
                 pow_n = pow_mod(b, fp_len, m),
                 update_hash_mod(hash, in[i + fp_len], in[i], pow_n, b, m),
                 b = random_prime(0));
}
1105 
/* BM-encode using two independent 16-bit modulo hashes (bases b1/b2,
 * moduli m1/m2) combined into one fingerprint; otherwise identical in
 * contract to bmz_bm_pack_mod. */
int
bmz_bm_pack_mod16x2(const void *src, size_t in_len, void *dst,
                    size_t *out_len_p, size_t offset, size_t fp_len,
                    void *work_mem, size_t b1, size_t b2,
                    size_t m1, size_t m2) {
  size_t pow_n_b1, pow_n_b2;
  /* macro args: init hash; rolling update; re-randomize bases on collision */
  BM_ENCODE_BODY(hash = r_hash_mod16x2(in + i, fp_len, b1, b2, m1, m2);
                 pow_n_b1 = pow_mod32(b1, fp_len, m1);
                 pow_n_b2 = pow_mod32(b2, fp_len, m2),
                 update_hash_mod16x2(hash, in[i + fp_len], in[i],
                                     pow_n_b1, pow_n_b2, b1, b2, m1, m2),
                 b1 = random_prime(0);
                 b2 = random_prime(b1));
}
1120 
/* BM-encode using two 16-bit mask (power-of-two modulus) hashes with
 * bases b1/b2; contract identical to bmz_bm_pack_mod. */
int
bmz_bm_pack_mask16x2(const void *src, size_t in_len, void *dst,
                     size_t *out_len_p, size_t offset, size_t fp_len,
                     void *work_mem, size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;
  /* macro args: init hash; rolling update; re-randomize bases on collision */
  BM_ENCODE_BODY(hash = r_hash_mask16x2(in + i, fp_len, b1, b2);
                 pow_n_b1 = pow_mask32(b1, fp_len, BM_MASK16);
                 pow_n_b2 = pow_mask32(b2, fp_len, BM_MASK16),
                 update_hash_mask16x2(hash, in[i + fp_len], in[i],
                                      pow_n_b1, pow_n_b2, b1, b2),
                 b1 = random_prime(0);
                 b2 = random_prime(b1));
}
1134 
/* BM-encode using a single mask hash over the full size mask (BM_MASKSZ)
 * with base b; contract identical to bmz_bm_pack_mod.  This is the
 * default algorithm used by bmz_pack. */
int
bmz_bm_pack_mask(const void *src, size_t in_len, void *dst, size_t *out_len_p,
                 size_t offset, size_t fp_len, void *work_mem, size_t b) {
  size_t pow_n;
  /* macro args: init hash; rolling update; re-randomize base on collision */
  BM_ENCODE_BODY(hash = r_hash_mask(in + i, fp_len, b, BM_MASKSZ);
                 pow_n = pow_mask(b, fp_len, BM_MASKSZ),
                 update_hash_mask(hash, in[i + fp_len], in[i],
                                  pow_n, b, BM_MASKSZ),
                 b = random_prime(0));
}
1145 
/* BM-encode using two 32-bit mask hashes (bases b1/b2, both over
 * BM_MASK32); contract identical to bmz_bm_pack_mod. */
int
bmz_bm_pack_mask32x2(const void *src, size_t in_len, void *dst,
                     size_t *out_len_p, size_t offset, size_t fp_len,
                     void *work_mem, size_t b1, size_t b2) {
  size_t pow_n_b1, pow_n_b2;
  /* macro args: init hash; rolling update; re-randomize bases on collision */
  BM_ENCODE_BODY(hash = r_hash_mask32x2(in + i, fp_len, b1, b2);
                 pow_n_b1 = pow_mask(b1, fp_len, BM_MASK32);
                 pow_n_b2 = pow_mask(b2, fp_len, BM_MASK32),
                 update_hash_mask32x2(hash, in[i + fp_len], in[i],
                                      pow_n_b1, pow_n_b2, b1, b2),
                 b1 = random_prime(0);
                 b2 = random_prime(b1));
}
1159 
/* Worst-case LZO expansion for an incompressible block of _len_ bytes. */
#define BM_LZ_MAX(_len_) ((_len_) / 16 + 64 + 3)

/* Output-buffer size required for bmz compression of in_len bytes:
 * the input itself plus the LZ worst-case overhead plus one flag byte. */
size_t
bmz_pack_buflen(size_t in_len) {
  size_t overhead = BM_LZ_MAX(in_len) + 1;
  return in_len + overhead;
}
1167 
1168 
1169 
1170 /* Workmem (lut) for bm pack */
1171 size_t
1172 bmz_bm_pack_worklen(size_t in_len, size_t fp_len) {
1173  return bm_lut_size(in_len / fp_len) * sizeof(BmLutEntry);
1174 }
1175 
/* Auxiliary memory for the compressor: whichever of the LZ and BM work
 * areas is larger, each padded by 16 bytes for alignment slack. */
static size_t
bmz_pack_auxlen(size_t in_len, size_t fp_len) {
  size_t bm_need = 16 + bmz_bm_pack_worklen(in_len, fp_len);
  size_t lz_need = 16 + bmz_lz_pack_worklen(in_len);

  if (lz_need > bm_need)
    return lz_need;
  return bm_need;
}
1183 
/* Total work memory for bmz compression: room for the temporary bm output
 * (in_len bytes) plus the auxiliary area.  fp_len must be positive. */
size_t
bmz_pack_worklen(size_t in_len, size_t fp_len) {
  BM_CHECK(fp_len > 0);
  return bmz_pack_auxlen(in_len, fp_len) + in_len;
}
1190 
/* Work memory for bmz decompression: the intermediate bm stream, which is
 * at most the final output plus its one-byte header. */
size_t
bmz_unpack_worklen(size_t out_len) {
  return 1 + out_len;
}
1195 
/* Shared body of the bmz_pack_* wrappers: bm-encode into a scratch area
 * at the front of work_mem (tlen = in_len + 1 covers the flag byte), then
 * LZO-compress that intermediate stream into `out`.  Expects in, in_len,
 * out, out_len_p and work_mem in scope; _bm_pack_ must be a
 * bmz_bm_pack_* call writing into dst/&tlen with work_mem_aligned as its
 * work memory. */
#define BMZ_PACK_BODY(_bm_pack_) \
  size_t tlen = in_len + 1; \
  Byte *dst = (Byte *)work_mem; \
  Byte *aux_mem = dst + tlen; \
  Byte *work_mem_aligned = BM_ALIGN(aux_mem, 8); \
  int ret; \
  \
  /* overlap flag assume the following memory layout \
   * |=============== input/output =================| \
   * |<------------- bmz_pack_buflen -------------->| \
   * |<---------------- in_len ---------------->| \
   * |<-- *out_len_p -->| \
   */ \
  ret = _bm_pack_; \
  if (ret != BMZ_E_OK) return ret; \
  return bmz_lz_pack(dst, tlen, out, out_len_p, work_mem_aligned)
1212 
/* bmz compression (bm stage + LZO) using the single modulo hash with base
 * b and modulus m.  `flags` is accepted for API symmetry; this wrapper
 * does not read it. */
int
bmz_pack_mod(const void *in, size_t in_len, void *out, size_t *out_len_p,
             size_t offset, size_t fp_len, unsigned flags, void *work_mem,
             size_t b, size_t m) {
  BMZ_PACK_BODY(bmz_bm_pack_mod(in, in_len, dst, &tlen,
                                offset, fp_len, work_mem_aligned, b, m));
}
1220 
/* bmz compression using the dual 16-bit modulo hash (bases b1/b2, moduli
 * m1/m2).  `flags` is accepted for API symmetry; not read here. */
int
bmz_pack_mod16x2(const void *in, size_t in_len, void *out, size_t *out_len_p,
                 size_t offset, size_t fp_len, unsigned flags, void *work_mem,
                 size_t b1, size_t b2, size_t m1, size_t m2) {
  BMZ_PACK_BODY(bmz_bm_pack_mod16x2(in, in_len, dst, &tlen,
                                    offset, fp_len, work_mem_aligned, b1, b2, m1, m2));
}
1228 
/* bmz compression using the dual 16-bit mask hash (bases b1/b2).
 * `flags` is accepted for API symmetry; not read here. */
int
bmz_pack_mask16x2(const void *in, size_t in_len, void *out, size_t *out_len_p,
                  size_t offset, size_t fp_len, unsigned flags, void *work_mem,
                  size_t b1, size_t b2) {
  BMZ_PACK_BODY(bmz_bm_pack_mask16x2(in, in_len, dst, &tlen,
                                     offset, fp_len, work_mem_aligned, b1, b2));
}
1236 
/* bmz compression using the single mask hash (base b) — the default
 * algorithm.  `flags` is accepted for API symmetry; not read here. */
int
bmz_pack_mask(const void *in, size_t in_len, void *out, size_t *out_len_p,
              size_t offset, size_t fp_len, unsigned flags, void *work_mem,
              size_t b) {
  BMZ_PACK_BODY(bmz_bm_pack_mask(in, in_len, dst, &tlen,
                                 offset, fp_len, work_mem_aligned, b));
}
1244 
/* bmz compression using the dual 32-bit mask hash (bases b1/b2).
 * `flags` is accepted for API symmetry; not read here. */
int
bmz_pack_mask32x2(const void *in, size_t in_len, void *out, size_t *out_len_p,
                  size_t offset, size_t fp_len, unsigned flags, void *work_mem,
                  size_t b1, size_t b2) {
  BMZ_PACK_BODY(bmz_bm_pack_mask32x2(in, in_len, dst, &tlen,
                                     offset, fp_len, work_mem_aligned, b1, b2));
}
1252 
1253 int
1254 bmz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p,
1255  size_t offset, size_t fp_len, unsigned flags, void *work_mem) {
1256  switch (flags >> 24) {
1257  case BMZ_HASH_MOD:
1258  return bmz_pack_mod(in, in_len, out, out_len_p, offset, fp_len, flags,
1259  work_mem, BM_B1, BM_M1);
1260  case BMZ_HASH_MOD16X2:
1261  return bmz_pack_mod16x2(in, in_len, out, out_len_p, offset, fp_len, flags,
1262  work_mem, BM_B1, BM_B2, BM_M1, BM_M2);
1263  case BMZ_HASH_MASK16X2:
1264  return bmz_pack_mask16x2(in, in_len, out, out_len_p, offset, fp_len, flags,
1265  work_mem, BM_B1, BM_B2);
1266  case 0: /* default */
1267  case BMZ_HASH_MASK:
1268  return bmz_pack_mask(in, in_len, out, out_len_p, offset, fp_len, flags,
1269  work_mem, BM_B1);
1270  case BMZ_HASH_MASK32X2:
1271  return bmz_pack_mask32x2(in, in_len, out, out_len_p, offset, fp_len, flags,
1272  work_mem, BM_B1, BM_B2);
1273  default:
1274  BM_DIE("unknown hash algorithm: %u", (flags >> 24));
1275  }
1276  return BMZ_E_ERROR;
1277 }
1278 
1279 int
1280 bmz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p,
1281  void *work_mem) {
1282  Byte *dst = (Byte *)work_mem;
1283  size_t tlen = *out_len_p + 1;
1284  int ret;
1285 
1286  ret = bmz_lz_unpack((Byte *)in, in_len, dst, &tlen);
1287  if (ret != BMZ_E_OK) return ret;
1288  return bmz_bm_unpack(dst, tlen, out, out_len_p);
1289 }
1290 
1291 
/* Decode a match position written by BM_ENCODE_INT.  The byte width is
 * recomputed from the current output position _cpos_ (mirroring
 * BM_ENCODE_DELTA, which sized the field from the encoder's input
 * position), so encoder and decoder always agree on the width.  Requires
 * `ip` and BM_NEED_IN at the expansion site.
 *
 * Fixes vs. the previous revision:
 *  - case 6 was missing its `break` and fell through into BM_DIE,
 *    aborting on every 6-byte position;
 *  - bytes shifted left by 24 were promoted to (signed) int, so a set
 *    top bit sign-extended into the upper half of the 64-bit target in
 *    cases 4-6; those shifts now widen to UInt64 first. */
#define BM_DECODE_POS(_cpos_, _n_) do { \
  UInt64 _ipos = _cpos_; \
  int w = BM_INT_WIDTH(_ipos); \
  BM_NEED_IN(w); \
  \
  switch (w) { \
  case 1: \
    _n_ = *ip++; \
    break; \
  case 2: \
    _n_ = (*ip++ << 8); \
    _n_ |= *ip++; \
    break; \
  case 3: \
    _n_ = (*ip++ << 16); \
    _n_ |= (*ip++ << 8); \
    _n_ |= *ip++; \
    break; \
  case 4: \
    _n_ = ((UInt64)*ip++ << 24); \
    _n_ |= (*ip++ << 16); \
    _n_ |= (*ip++ << 8); \
    _n_ |= *ip++; \
    break; \
  case 5: \
    _n_ = ((UInt64)*ip++ << 32); \
    _n_ |= ((UInt64)*ip++ << 24); \
    _n_ |= (*ip++ << 16); \
    _n_ |= (*ip++ << 8); \
    _n_ |= *ip++; \
    break; \
  case 6: \
    _n_ = ((UInt64)*ip++ << 40); \
    _n_ |= ((UInt64)*ip++ << 32); \
    _n_ |= ((UInt64)*ip++ << 24); \
    _n_ |= (*ip++ << 16); \
    _n_ |= (*ip++ << 8); \
    _n_ |= *ip++; \
    break; \
  default: \
    BM_DIE("%s", "256TB ought to be enough for anyone!"); \
  } \
} while (0)
1334 
/* Match lengths share the varint wire format (see BM_DECODE_VINT). */
#define BM_DECODE_LEN(_n_) BM_DECODE_VINT(_n_)
1336 
/* Reverse the bm encoding produced by the BM_ENCODE_BODY family.
 *
 * Wire format: one flag byte (0 = remainder is a verbatim copy of the
 * original data, 1 = encoded), then a byte stream where BM_ESC introduces
 * either an escaped literal ESC (BM_ESC BM_ESC, decoding to one 0xfe
 * byte) or a <position,length> copy pointer (see BM_DECODE_POS and
 * BM_DECODE_LEN).  On entry *out_len_p is the capacity of dst; on return
 * it holds the number of bytes produced.  Returns BMZ_E_OK, or
 * BMZ_E_INPUT_OVERRUN / BMZ_E_OUTPUT_OVERRUN on a truncated input or an
 * undersized output buffer. */
int
bmz_bm_unpack(const void *src, size_t in_len, void *dst, size_t *out_len_p) {
  Byte *in = (Byte *)src, *out = (Byte *)dst;
  Byte *ip = in, *last_ip = ip + 1, *op = out, *cp;
  Byte *in_end = ip + in_len, *out_end = op + *out_len_p;
  int remains;

  if (*ip++ == 0) {
    /* stored verbatim.  NOTE(review): this path does not check dst's
     * capacity against in_len - 1 — confirm callers guarantee it. */
    memcpy(out, ip, --in_len);
    *out_len_p = in_len;
    return BMZ_E_OK;
  }

  while (ip < in_end) {
    if (*ip == BM_ESC) {
      UInt64 len = ip - last_ip, pos = 0;

      /* flush literals accumulated since the last escape */
      if (len) {
        BM_NEED_OUT(len);
        memcpy(op, last_ip, len);
        op += len;
        last_ip = ip;
      }
      BM_NEED_IN(1);

      if (ip[1] == BM_ESC) {
        /* escaped literal ESC: skip the marker, keep one data byte */
        ++last_ip;
        ip += 2;
      }
      else {
        /* <pos,len> copy pointer into already-produced output; byte-wise
         * copy is deliberate since source and destination may overlap */
        ++ip;
        BM_DECODE_POS(op - out, pos);
        BM_DECODE_LEN(len);
        BM_NEED_OUT(len);
        last_ip = ip;
        cp = out + pos;

        while (len--) *op++ = *cp++;
      }
    }
    else ++ip;
  }
  /* trailing literals after the final escape */
  remains = in_end - last_ip;
  BM_NEED_OUT(remains);
  memcpy(op, last_ip, remains);
  *out_len_p = op - out + remains;

  return BMZ_E_OK;

input_overrun:
  *out_len_p = op - out;
  return BMZ_E_INPUT_OVERRUN;

output_overrun:
  *out_len_p = op - out;
  return BMZ_E_OUTPUT_OVERRUN;
}
1394 
/* Debug helper: print a bm-encoded stream to stdout, rendering copy
 * pointers as colorized <pos,len> markers (without expanding them) and
 * literals verbatim, then log aggregate pointer statistics.  Returns
 * BMZ_E_OK, or BMZ_E_INPUT_OVERRUN on a truncated stream. */
int
bmz_bm_dump(const void *src, size_t in_len) {
  Byte *in = (Byte *)src, *ip = in, *in_end = ip + in_len;
  UInt64 pos = 0, len, cpos = 0, tot_pte_sz = 0, tot_ptr_sz = 0, nptrs = 0;
  int is_on = *ip++;  /* leading flag byte: 0 = stored verbatim */

  printf(BM_COLOR_DBG "%d" BM_COLOR_END, is_on);

  if (!is_on) {
    fwrite(ip, 1, in_end - ip, stdout);
    return BMZ_E_OK;
  }

  while (ip < in_end) {
    if (*ip == BM_ESC) {
      Byte *ip0 = ip;
      BM_NEED_IN(1);

      if (ip[1] != BM_ESC) {
        /* copy pointer: decode and report it instead of expanding */
        ++ip;
        BM_DECODE_POS(cpos, pos);
        BM_DECODE_LEN(len);
        printf(BM_COLOR_DBG "<%llu,%llu>" BM_COLOR_END,
               (unsigned long long)pos, (unsigned long long)len);
        cpos += len;
        tot_pte_sz += len;      /* bytes the pointer stands for */
        tot_ptr_sz += ip - ip0; /* bytes the pointer occupies on the wire */
        ++nptrs;
      }
      else {
        /* escaped literal ESC (two bytes on the wire) */
        putchar(*ip++);
        putchar(*ip++);
        cpos += 2;
      }
    }
    else {
      putchar(*ip++);
      ++cpos;
    }
  }
  /* NOTE(review): nptrs may be 0 here, making these averages NaN/inf */
  BM_LOG(1, "%llu pointers, avg pointee size: %.3f, avg pointer size: %.3f\n",
         (unsigned long long)nptrs, (double)tot_pte_sz / nptrs,
         (double)tot_ptr_sz / nptrs);
  return BMZ_E_OK;

input_overrun:
  return BMZ_E_INPUT_OVERRUN;
}
1443 
1444 int
1446  int ret = lzo_init();
1447  return ret == LZO_E_OK ? BMZ_E_OK : BMZ_E_ERROR;
1448 }
1449 
1450 int
1451 bmz_lz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p,
1452  void *work_mem) {
1453  lzo_uint olen = *out_len_p;
1454  int ret = lzo1x_1_compress((Byte *)in, in_len, (Byte *)out, &olen, work_mem);
1455  *out_len_p = olen;
1456  return ret;
1457 }
1458 
/* Work memory required by the LZO compressor.  `in_len` is unused:
 * LZO1X_MEM_COMPRESS is a fixed size independent of input length. */
size_t
bmz_lz_pack_worklen(size_t in_len) {
  return LZO1X_MEM_COMPRESS;
}
1463 
1464 int
1465 bmz_lz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p) {
1466  lzo_uint olen = *out_len_p;
1467  int ret = lzo1x_decompress((Byte *)in, in_len, (Byte *)out, &olen, NULL);
1468  *out_len_p = olen;
1469  return ret;
1470 }
1471 
1472 unsigned
1473 bmz_checksum(const void *in, size_t in_len) {
1474  return lzo_adler32(1, (Byte *)in, in_len);
1475 }
1476 
1477 unsigned
1478 bmz_update_checksum(unsigned s, const void *in, size_t in_len) {
1479  return lzo_adler32(s, (Byte *)in, in_len);
1480 }
1481 
1482 /* vim: et sw=2
1483  */
int bmz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem)
Perform bmz compression.
Definition: bmz.c:1254
int bmz_bm_pack_mod(const void *src, size_t in_len, void *dst, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b, size_t m)
Definition: bmz.c:1096
static void builtin_out(const char *buf, size_t len)
Definition: bmz.c:210
#define BM_CHECK(_cond_)
Definition: bmz.c:241
int bmz_bm_pack_mask(const void *src, size_t in_len, void *dst, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b)
Definition: bmz.c:1136
static UInt32 pow_mask32(UInt32 x, UInt32 n, UInt32 m)
Definition: bmz.c:477
#define BM_HASH_BENCH_BODY(_init_hash_, _update_hash_)
Definition: bmz.c:560
static void init_bmlut(BmLut *table, size_t n, void *work_mem)
Definition: bmz.c:320
size_t adapts
Definition: bmz.c:315
unsigned bmz_update_checksum(unsigned s, const void *in, size_t in_len)
Definition: bmz.c:1478
#define BM_MASK32
Definition: bmz.c:50
int bmz_set_collision_thresh(int thresh)
Definition: bmz.c:260
int bmz_bm_pack_mask32x2(const void *src, size_t in_len, void *dst, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:1147
static UInt32 r_hash_mask16x2(const Byte *buf, size_t len, UInt32 b1, UInt32 b2)
Definition: bmz.c:450
static UInt64 pow_mod(UInt64 x, UInt64 n, UInt64 m)
Definition: bmz.c:390
int64_t Int64
Definition: bmz.c:86
void bmz_bench_lut_mod(const void *src, size_t in_len, size_t fp_len, void *work_mem, size_t b, size_t m)
Definition: bmz.c:741
Definition: bmz.c:314
int bmz_check_hash_mask32x2(const void *src, size_t in_len, size_t fp_len, size_t b1, size_t b2)
Definition: bmz.c:662
static Int64 update_hash_mod(Int64 h, Byte in, Byte out, Int64 pow_n, Int64 b, Int64 m)
Definition: bmz.c:408
int bmz_bm_pack_mod16x2(const void *src, size_t in_len, void *dst, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b1, size_t b2, size_t m1, size_t m2)
Definition: bmz.c:1107
#define HT_NORETURN
Definition: compat-c.h:60
static BmzOutProc s_out_proc
Definition: bmz.c:220
size_t bmz_bm_pack_worklen(size_t in_len, size_t fp_len)
Definition: bmz.c:1172
#define BMZ_E_OUTPUT_OVERRUN
Definition: bmz.h:35
#define BM_NEED_OUT(_n_)
Definition: bmz.c:804
#define BM_HASH_CHECK_BODY(_init_hash_, _rehash_, _update_hash_)
Definition: bmz.c:540
static size_t bmz_pack_auxlen(size_t in_len, size_t fp_len)
Definition: bmz.c:1178
static UInt64 r_hash_mask(const Byte *buf, size_t len, UInt64 b, UInt64 m)
Definition: bmz.c:439
#define R_HASH_MASK_BODY(int_type)
Definition: bmz.c:427
uint32_t UInt32
Definition: bmz.c:84
size_t bmz_hash_mod(const void *in, size_t in_len, size_t b, size_t m)
Definition: bmz.c:514
#define BMZ_HASH_MASK
Definition: bmz-internal.h:31
uint64_t UInt64
Definition: bmz.c:85
static UInt64 r_hash_mask32x2(const Byte *buf, size_t len, UInt64 b1, UInt64 b2)
Definition: bmz.c:456
static int s_verbosity
Definition: bmz.c:207
#define BM_ENCODE_BODY(_init_hash_, _update_hash_, _randomize_)
Definition: bmz.c:1022
size_t bmz_lz_pack_worklen(size_t in_len)
Definition: bmz.c:1460
#define BM_LZ_MAX(_len_)
Definition: bmz.c:1161
int bmz_pack_mask32x2(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:1246
size_t bmz_hash_mask(const void *in, size_t in_len, size_t b)
Definition: bmz.c:530
int bmz_set_verbosity(int verbosity)
Set the verbosity of library for testing and debugging.
Definition: bmz.c:246
static UInt32 update_hash_mask16x2(UInt32 h, int in, int out, UInt32 pow1, UInt32 pow2, UInt32 b1, UInt32 b2)
Definition: bmz.c:499
int bmz_pack_mask16x2(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:1230
int bmz_check_hash_mod16x2(const void *src, size_t in_len, size_t fp_len, size_t b1, size_t b2, size_t m1, size_t m2)
Definition: bmz.c:594
static size_t random_prime(size_t excluded)
Definition: bmz.c:270
void bmz_bench_hash_mod(const void *src, size_t in_len, size_t fp_len)
Definition: bmz.c:584
static Int32 update_hash_mod32(Int32 h, Byte in, Byte out, Int32 pow_n, Int32 b, Int32 m)
Definition: bmz.c:413
size_t pos
Definition: bmz.c:311
static UInt32 r_hash_mod32(const Byte *buf, size_t len, UInt32 b, UInt32 m)
Definition: bmz.c:366
size_t bmz_hash_mask16x2(const void *in, size_t in_len, size_t b1, size_t b2)
Definition: bmz.c:525
static UInt64 r_hash_mod(const Byte *buf, size_t len, size_t b, UInt64 m)
Definition: bmz.c:361
int32_t Int32
Definition: bmz.c:87
#define BM_M2
Definition: bmz.c:46
static UInt64 pow_mask(UInt64 x, UInt64 n, UInt64 m)
Definition: bmz.c:472
#define BM_ESC
Definition: bmz.c:56
#define UPDATE_HASH_MASK_BODY
Definition: bmz.c:481
static UInt64 update_hash_mask32x2(UInt64 h, int in, int out, UInt64 pow1, UInt64 pow2, UInt64 b1, UInt64 b2)
Definition: bmz.c:506
#define BM_NPOS
Definition: bmz.c:93
int bmz_lz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p, void *work_mem)
Definition: bmz.c:1451
Definition: bmz.c:309
#define BM_B2
Definition: bmz.c:44
#define BMZ_HASH_MASK16X2
Definition: bmz-internal.h:30
long long unsigned Llu
Definition: bmz.c:90
static size_t s_primes[]
Definition: bmz.c:99
BmLutEntry * entries
Definition: bmz.c:316
void(* BmzDieProc)(const char *msg) HT_NORETURN
Signature of the fatal procedure.
Definition: bmz.h:121
size_t bmz_hash_mask32x2(const void *in, size_t in_len, size_t b1, size_t b2)
Definition: bmz.c:535
int bmz_check_hash_mod(const void *src, size_t in_len, size_t fp_len, size_t b, size_t m)
Definition: bmz.c:573
uint8_t Byte
Definition: bmz.c:83
#define BMZ_E_INPUT_OVERRUN
Definition: bmz.h:34
int bmz_init()
Perform bmz initialization only needs to be called once, mostly for sanity checks.
Definition: bmz.c:1445
void bmz_bench_hash_mask16x2(const void *src, size_t in_len, size_t fp_len)
Definition: bmz.c:631
void bmz_bench_lut_mask16x2(const void *src, size_t in_len, size_t fp_len, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:764
#define BMZ_HASH_MOD16X2
Definition: bmz-internal.h:29
#define BM_COLOR_END
Definition: bmz.c:80
int bmz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p, void *work_mem)
Perform bmz decompression.
Definition: bmz.c:1280
static UInt32 r_hash_mod16x2(const Byte *buf, size_t len, UInt32 b1, UInt32 b2, UInt32 m1, UInt32 m2)
Definition: bmz.c:372
int bmz_check_hash_mask(const void *src, size_t in_len, size_t fp_len, size_t b)
Definition: bmz.c:642
void bmz_bench_lut_mask(const void *src, size_t in_len, size_t fp_len, void *work_mem, size_t b)
Definition: bmz.c:776
static UInt32 update_hash_mod16x2(UInt32 h, int in, int out, UInt32 pow1, UInt32 pow2, UInt32 b1, UInt32 b2, UInt32 m1, UInt32 m2)
Definition: bmz.c:418
#define BMZ_E_OK
Definition: bmz.h:32
#define POW_MASK_BODY(_pow_fun_, _int_type_)
Definition: bmz.c:461
#define BM_DECODE_LEN(_n_)
Definition: bmz.c:1335
size_t bmz_pack_buflen(size_t in_len)
Compute bmz compression output buffer length.
Definition: bmz.c:1164
size_t hash
Definition: bmz.c:310
#define POW_MOD_BODY(_pow_fun_, _int_type_)
Definition: bmz.c:379
size_t bmz_unpack_worklen(size_t out_len)
Return size of work memory for bmz decompression.
Definition: bmz.c:1192
BmzOutProc bmz_set_out_proc(BmzOutProc proc)
Set messaging/logging procedure.
Definition: bmz.c:253
#define BM_DIE(_fmt_,...)
Definition: bmz.c:234
static HT_NORETURN void builtin_die(const char *msg)
Definition: bmz.c:215
int bmz_pack_mod(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem, size_t b, size_t m)
Definition: bmz.c:1214
int bmz_lz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p)
Definition: bmz.c:1465
static UInt32 pow_mod32(UInt32 x, UInt32 n, UInt32 m)
Definition: bmz.c:395
void bmz_bench_lut_mask32x2(const void *src, size_t in_len, size_t fp_len, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:786
int bmz_bm_pack_mask16x2(const void *src, size_t in_len, void *dst, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b1, size_t b2)
Definition: bmz.c:1122
#define BMZ_PACK_BODY(_bm_pack_)
Definition: bmz.c:1196
void bmz_bench_hash_mod16x2(const void *src, size_t in_len, size_t fp_len)
Definition: bmz.c:607
#define BM_B1
Definition: bmz.c:43
#define BM_LOG(_level_, _fmt_,...)
Definition: bmz.c:223
#define BM_NEED_IN(_n_)
Definition: bmz.c:801
#define R_HASH_MOD_BODY(_int_type_)
Definition: bmz.c:349
#define UPDATE_HASH_MOD_BODY
Definition: bmz.c:400
size_t probes
Definition: bmz.c:315
#define BM_MASKSZ
Definition: bmz.c:51
int bmz_pack_mod16x2(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem, size_t b1, size_t b2, size_t m1, size_t m2)
Definition: bmz.c:1222
#define BM_DECODE_POS(_cpos_, _n_)
Definition: bmz.c:1292
size_t size
Definition: bmz.c:315
static UInt32 update_hash_mask32(UInt32 h, int in, int out, UInt32 pow_n, UInt32 b, UInt32 m)
Definition: bmz.c:493
#define BMZ_HASH_MASK32X2
Definition: bmz-internal.h:32
void bmz_bench_lut_mod16x2(const void *src, size_t in_len, size_t fp_len, void *work_mem, size_t b1, size_t b2, size_t m1, size_t m2)
Definition: bmz.c:751
static UInt64 update_hash_mask(UInt64 h, int in, int out, UInt64 pow_n, UInt64 b, UInt64 m)
Definition: bmz.c:488
#define BM_M1
Definition: bmz.c:45
#define BM_MASK16
Definition: bmz.c:48
unsigned bmz_checksum(const void *in, size_t in_len)
A fast checksum (adler32) function that might be useful.
Definition: bmz.c:1473
#define BM_COLOR_DBG
Definition: bmz.c:78
int bmz_bm_unpack(const void *src, size_t in_len, void *dst, size_t *out_len_p)
Definition: bmz.c:1338
void(* BmzOutProc)(const char *msg, size_t len)
Signature of the messaging/logging procedure.
Definition: bmz.h:117
int bmz_pack_mask(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem, size_t b)
Definition: bmz.c:1238
#define BMZ_HASH_MOD
Definition: bmz-internal.h:28
int bmz_bm_dump(const void *src, size_t in_len)
Definition: bmz.c:1396
void bmz_bench_hash(const void *in, size_t in_len, unsigned type)
Definition: bmz.c:686
void bmz_bench_hash_mask(const void *src, size_t in_len, size_t fp_len)
Definition: bmz.c:653
static BmzDieProc s_die_proc
Definition: bmz.c:221
size_t collisions
Definition: bmz.c:315
#define BM_LUT_BENCH_BODY(_init_hash_, _update_hash_)
Definition: bmz.c:708
static size_t bm_lut_size(size_t n)
Definition: bmz.c:293
static UInt32 r_hash_mask32(const Byte *buf, size_t len, UInt32 b, UInt32 m)
Definition: bmz.c:444
void bmz_bench_hash_mask32x2(const void *src, size_t in_len, size_t fp_len)
Definition: bmz.c:675
size_t bmz_hash_mod16x2(const void *in, size_t in_len, size_t b1, size_t b2, size_t m1, size_t m2)
Definition: bmz.c:519
size_t bmz_pack_worklen(size_t in_len, size_t fp_len)
Return size of work memory for bmz compression.
Definition: bmz.c:1186
int bmz_check_hash_mask16x2(const void *src, size_t in_len, size_t fp_len, size_t b1, size_t b2)
Definition: bmz.c:618
#define BMZ_E_ERROR
Definition: bmz.h:33
static size_t s_collision_thresh
Definition: bmz.c:204