ravi/mir/mir-reduce.h

/* This file is a part of MIR project.
   Copyright (C) 2018-2021 Vladimir Makarov <vmakarov.gcc@gmail.com>.
*/

#ifndef MIR_REDUCE_H
#define MIR_REDUCE_H

/* Data compression.  Major goals are simplicity, fast decompression
   speed, moderate compression speed.  The algorithm is tuned for
   binary MIR compression and close to LZ4.  Only we use a bit
   different format and offsets in symbol numbers instead of just
   offsets.

   A better compression (on par with LZ4) could be achieved by adding
   elements for all positions (now positions inside referenced symbols
   are excluded) or/and increasing the buffer or/and increasing the
   table.  But it would slow down the compression or/and increase the
   used memory.

   Functions reduce_encode, reduce_decode, reduce_encode_start,
   reduce_encode_put, reduce_encode_finish, reduce_decode_start,
   reduce_decode_get, reduce_decode_finish are the only interface
   functions.

   Format of compressed data: "MIR", elements*, zero byte, 8-byte check hash in little endian form
   Format of element:
    o 8 bits tag
      (N bits for symbol length; 0 means no sym, 2^N -1 means symbol length as uint present;
      (8-N) bits for reference length; 0 means no ref, 2^(8-N) - 1 means length as uint present)
    o [uint for symbol lenght]*, symbol string,
    o [uint for ref len]*, symbol number as uint */

#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "mir-hash.h"

#define FALSE 0
#define TRUE 1
#define _REDUCE_DATA_PREFIX "MIR" /* first chars of compressed data */
#define _REDUCE_SYMB_TAG_LEN 3    /* for some application could be 4 */
#define _REDUCE_SYMB_TAG_LONG ((1 << _REDUCE_SYMB_TAG_LEN) - 1) /* should be not changed */
#define _REDUCE_REF_TAG_LEN (8 - _REDUCE_SYMB_TAG_LEN)
#define _REDUCE_REF_TAG_LONG ((1 << _REDUCE_REF_TAG_LEN) - 1) /* should be not changed */
#define _REDUCE_START_LEN 4                                   /* Should be at least 4 */
#define _REDUCE_BUF_LEN (1 << 18)
/* The following should be power of two. There will be no space saving if it is less than 1/4 of buf
   length. */
#define _REDUCE_TABLE_SIZE (_REDUCE_BUF_LEN / 4)
#define _REDUCE_MAX_SYMB_LEN (2047)

typedef size_t (*reduce_reader_t) (void *start, size_t len, void *aux_data);
typedef size_t (*reduce_writer_t) (const void *start, size_t len, void *aux_data);

struct _reduce_el {
  uint32_t pos, num, next, head;
};

struct _reduce_encode_data {
  reduce_writer_t writer;
  uint32_t el_free;
  uint32_t curr_symb_len;
  uint8_t curr_symb[_REDUCE_MAX_SYMB_LEN];
  struct _reduce_el table[_REDUCE_TABLE_SIZE]; /* hash -> el */
};

struct _reduce_decode_data {
  uint8_t eof_p;
  uint32_t buf_get_pos;
  reduce_reader_t reader;
  uint32_t ind2pos[_REDUCE_BUF_LEN];
};

struct reduce_data {
  union {
    struct _reduce_encode_data encode;
    struct _reduce_decode_data decode;
  } u;
  void *aux_data;
  uint8_t ok_p;
  uint64_t check_hash;
  uint32_t curr_num, buf_bound;
  uint8_t buf[_REDUCE_BUF_LEN];
};

static inline uint32_t _reduce_min (uint32_t a, uint32_t b) { return a < b ? a : b; }

static inline uint32_t _reduce_get_new_el (struct reduce_data *data) {
  struct _reduce_encode_data *encode_data = &data->u.encode;
  uint32_t res = encode_data->el_free;

  if (res != UINT32_MAX) encode_data->el_free = encode_data->table[res].next;
  return res;
}

static inline void _reduce_put (struct reduce_data *data, int byte) {
  uint8_t u = byte;

  if (data->u.encode.writer (&u, 1, data->aux_data) != 1) data->ok_p = FALSE;
}

static inline int _reduce_get (reduce_reader_t reader, void *aux_data) {
  uint8_t u;

  if (reader (&u, 1, aux_data) != 1) return -1;
  return u;
}

static inline uint32_t _reduce_ref_offset_size (uint32_t offset) {
  return offset < (1 << 7) ? 1 : offset < (1 << 14) ? 2 : offset < (1 << 21) ? 3 : 4;
}

static inline uint32_t _reduce_ref_size (uint32_t len, uint32_t offset) {
  assert (len >= _REDUCE_START_LEN);
  len -= _REDUCE_START_LEN - 1;
  return ((len < _REDUCE_REF_TAG_LONG ? 0 : _reduce_ref_offset_size (len))
          + _reduce_ref_offset_size (offset));
}

static inline void _reduce_uint_write (struct reduce_data *data, uint32_t u) {
  int n;

  assert (u < (1 << 7 * 4));
  for (n = 1; n <= 4 && u >= (1 << 7 * n); n++)
    ;
  _reduce_put (data, (1 << (8 - n)) | ((u >> (n - 1) * 8) & 0xff)); /* tag */
  for (int i = 2; i <= n; i++) _reduce_put (data, (u >> (n - i) * 8) & 0xff);
}

static inline int64_t _reduce_uint_read (reduce_reader_t reader, void *aux_data) {
  int i, n, r = _reduce_get (reader, aux_data);
  uint32_t u, v;

  if (r < 0) return -1;
  for (u = (uint32_t) r, n = 1; n <= 4 && (u >> (8 - n)) != 1; n++)
    ;
  assert ((u >> (8 - n)) == 1);
  v = u & (0xff >> n);
  for (i = 1; i < n; i++) {
    if ((r = _reduce_get (reader, aux_data)) < 0) return -1;
    v = v * 256 + (uint32_t) r;
  }
  return v;
}

static inline void _reduce_hash_write (struct reduce_data *data, uint64_t h) {
  _reduce_put (data, 0); /* 0 tag */
  for (int i = 0; i < sizeof (uint64_t); i++) _reduce_put (data, (h >> i * 8) & 0xff);
}

static inline uint64_t _reduce_str2hash (const uint8_t *s) {
  uint64_t h = 0;

  for (int i = 0; i < sizeof (uint64_t); i++) h |= (uint64_t) s[i] << i * 8;
  return h;
}

static inline int _reduce_symb_flush (struct reduce_data *data, int ref_tag) {
  uint8_t u;
  struct _reduce_encode_data *encode_data = &data->u.encode;
  uint32_t len = encode_data->curr_symb_len;

  if (len == 0 && ref_tag == 0) return FALSE;
  u = ((len < _REDUCE_SYMB_TAG_LONG ? len : _REDUCE_SYMB_TAG_LONG) << _REDUCE_REF_TAG_LEN)
      | ref_tag;
  encode_data->writer (&u, 1, data->aux_data);
  if (len >= _REDUCE_SYMB_TAG_LONG) _reduce_uint_write (data, len);
  encode_data->writer (encode_data->curr_symb, len, data->aux_data);
  encode_data->curr_symb_len = 0;
  return TRUE;
}

static inline void _reduce_output_byte (struct reduce_data *data, uint32_t pos) {
  struct _reduce_encode_data *encode_data = &data->u.encode;

  if (encode_data->curr_symb_len + 1 > _REDUCE_MAX_SYMB_LEN) {
    _reduce_symb_flush (data, 0);
    encode_data->curr_symb_len = 0;
  }
  encode_data->curr_symb[encode_data->curr_symb_len++] = data->buf[pos];
}

static inline void _reduce_output_ref (struct reduce_data *data, uint32_t offset, uint32_t len) {
  uint32_t ref_tag;

  assert (len >= _REDUCE_START_LEN);
  len -= _REDUCE_START_LEN - 1;
  ref_tag = len < _REDUCE_REF_TAG_LONG ? len : _REDUCE_REF_TAG_LONG;
  _reduce_symb_flush (data, ref_tag);
  if (len >= _REDUCE_REF_TAG_LONG) _reduce_uint_write (data, len);
  _reduce_uint_write (data, offset);
}

#define _REDUCE_HASH_SEED 24

static inline uint32_t _reduce_dict_find_longest (struct reduce_data *data, uint32_t pos,
                                                  uint32_t *dict_pos) {
  uint32_t len, best_len, len_bound;
  uint64_t hash;
  uint32_t off, ref_size, best_ref_size;
  uint32_t curr, next;
  const uint8_t *s1, *s2;
  struct _reduce_el *el, *best_el = NULL;
  struct _reduce_encode_data *encode_data = &data->u.encode;

  if (pos + _REDUCE_START_LEN > data->buf_bound) return 0;
  /* To have the same compressed output independently of the target
     and the used compiler, use strict hash even if it decreases
     compression speed by 10%.  */
  hash
    = mir_hash_strict (&data->buf[pos], _REDUCE_START_LEN, _REDUCE_HASH_SEED) % _REDUCE_TABLE_SIZE;
  for (curr = encode_data->table[hash].head; curr != UINT32_MAX; curr = next) {
    next = encode_data->table[curr].next;
    el = &encode_data->table[curr];
    len_bound = _reduce_min (data->buf_bound - pos, pos - el->pos);
    if (len_bound < _REDUCE_START_LEN) continue;
    s1 = &data->buf[el->pos];
    s2 = &data->buf[pos];
#if MIR_HASH_UNALIGNED_ACCESS
    assert (_REDUCE_START_LEN >= 4);
    if (*(uint32_t *) &s1[0] != *(uint32_t *) &s2[0]) continue;
    len = 4;
#else
    len = 0;
#endif
    for (; len < len_bound; len++)
      if (s1[len] != s2[len]) break;
#if !MIR_HASH_UNALIGNED_ACCESS
    if (len < _REDUCE_START_LEN) continue;
#endif
    off = data->curr_num - el->num;
    if (best_el == NULL) {
      best_len = len;
      best_el = el;
      best_ref_size = _reduce_ref_size (len, off);
      continue;
    }
    ref_size = _reduce_ref_size (len, off);
    if (best_len + ref_size < len + best_ref_size) {
      best_len = len;
      best_el = el;
      best_ref_size = ref_size;
    }
  }
  if (best_el == NULL) return 0;
  *dict_pos = best_el->num;
  return best_len;
}

static inline void _reduce_dict_add (struct reduce_data *data, uint32_t pos) {
  uint64_t hash;
  uint32_t prev, curr, num = data->curr_num++;
  struct _reduce_encode_data *encode_data = &data->u.encode;

  if (pos + _REDUCE_START_LEN > data->buf_bound) return;
  hash
    = mir_hash_strict (&data->buf[pos], _REDUCE_START_LEN, _REDUCE_HASH_SEED) % _REDUCE_TABLE_SIZE;
  if ((curr = _reduce_get_new_el (data)) == UINT32_MAX) { /* rare case: use last if any */
    for (prev = UINT32_MAX, curr = encode_data->table[hash].head;
         curr != UINT32_MAX && encode_data->table[curr].next != UINT32_MAX;
         prev = curr, curr = encode_data->table[curr].next)
      ;
    if (curr == UINT32_MAX) return; /* no more free els */
    if (prev != UINT32_MAX)
      encode_data->table[prev].next = encode_data->table[curr].next;
    else
      encode_data->table[hash].head = encode_data->table[curr].next;
  }
  encode_data->table[curr].pos = pos;
  encode_data->table[curr].num = num;
  encode_data->table[curr].next = encode_data->table[hash].head;
  encode_data->table[hash].head = curr;
}

static void _reduce_reset_next (struct reduce_data *data) {
  struct _reduce_encode_data *encode_data = &data->u.encode;

  for (uint32_t i = 0; i < _REDUCE_TABLE_SIZE; i++) {
    encode_data->table[i].next = i + 1;
    encode_data->table[i].head = UINT32_MAX;
  }
  encode_data->table[_REDUCE_TABLE_SIZE - 1].next = UINT32_MAX;
  encode_data->el_free = 0;
}

#define _REDUCE_CHECK_HASH_SEED 42

static inline struct reduce_data *reduce_encode_start (reduce_writer_t writer, void *aux_data) {
  struct reduce_data *data = malloc (sizeof (struct reduce_data));
  char prefix[] = _REDUCE_DATA_PREFIX;
  size_t prefix_size = strlen (prefix);

  if (data == NULL) return data;
  data->u.encode.writer = writer;
  data->aux_data = aux_data;
  data->check_hash = _REDUCE_CHECK_HASH_SEED;
  data->buf_bound = 0;
  data->ok_p = writer (prefix, prefix_size, aux_data) == prefix_size;
  return data;
}

static inline void _reduce_encode_buf (struct reduce_data *data) {
  uint32_t dict_len, dict_pos, base;

  if (data->buf_bound == 0) return;
  data->check_hash = mir_hash_strict (data->buf, data->buf_bound, data->check_hash);
  data->curr_num = data->u.encode.curr_symb_len = 0;
  _reduce_reset_next (data);
  for (uint32_t pos = 0; pos < data->buf_bound;) {
    dict_len = _reduce_dict_find_longest (data, pos, &dict_pos);
    base = data->curr_num;
    if (dict_len == 0) {
      _reduce_output_byte (data, pos);
      _reduce_dict_add (data, pos);
      pos++;
      continue;
    }
    _reduce_output_ref (data, base - dict_pos, dict_len);
    _reduce_dict_add (data, pos); /* replace */
    pos += dict_len;
  }
  _reduce_symb_flush (data, 0);
}

static inline void reduce_encode_put (struct reduce_data *data, int c) {
  if (data->buf_bound < _REDUCE_BUF_LEN) {
    data->buf[data->buf_bound++] = c;
    return;
  }
  _reduce_encode_buf (data);
  data->buf_bound = 0;
  data->buf[data->buf_bound++] = c;
}

static inline int reduce_encode_finish (struct reduce_data *data) {
  int ok_p;

  _reduce_encode_buf (data);
  _reduce_hash_write (data, data->check_hash);
  ok_p = data->ok_p;
  free (data);
  return ok_p;
}

static inline struct reduce_data *reduce_decode_start (reduce_reader_t reader, void *aux_data) {
  struct reduce_data *data = malloc (sizeof (struct reduce_data));
  struct _reduce_decode_data *decode_data = &data->u.decode;
  char prefix[] = _REDUCE_DATA_PREFIX, str[sizeof (prefix)];
  size_t prefix_size = strlen (prefix);

  if (data == NULL) return data;
  decode_data->reader = reader;
  data->aux_data = aux_data;
  data->check_hash = _REDUCE_CHECK_HASH_SEED;
  decode_data->buf_get_pos = data->buf_bound = 0;
  data->ok_p
    = reader (str, prefix_size, aux_data) == prefix_size && memcmp (prefix, str, prefix_size) == 0;
  decode_data->eof_p = FALSE;
  return data;
}

static inline int reduce_decode_get (struct reduce_data *data) {
  uint8_t tag, hash_str[sizeof (uint64_t)];
  uint32_t sym_len, ref_len, ref_ind, sym_pos, pos = 0, curr_ind = 0;
  int64_t r;
  struct _reduce_decode_data *decode_data = &data->u.decode;
  reduce_reader_t reader = decode_data->reader;

  if (decode_data->buf_get_pos < data->buf_bound) return data->buf[decode_data->buf_get_pos++];
  if (decode_data->eof_p) return -1;
  for (;;) {
    if (reader (&tag, 1, data->aux_data) == 0) break;
    if (tag == 0) { /* check hash */
      if (reader (hash_str, sizeof (hash_str), data->aux_data) != sizeof (hash_str)
          || reader (&tag, 1, data->aux_data) != 0)
        break;
      if (pos != 0) data->check_hash = mir_hash_strict (data->buf, pos, data->check_hash);
      if (_reduce_str2hash (hash_str) != data->check_hash) break;
      decode_data->eof_p = TRUE;
      decode_data->buf_get_pos = 0;
      data->buf_bound = pos;
      return pos == 0 ? -1 : data->buf[decode_data->buf_get_pos++];
    }
    sym_len = tag >> _REDUCE_REF_TAG_LEN;
    if (sym_len != 0) {
      if (sym_len == _REDUCE_SYMB_TAG_LONG) {
        if ((r = _reduce_uint_read (reader, data->aux_data)) < 0) break;
        sym_len = r;
      }
      if (sym_len > _REDUCE_MAX_SYMB_LEN || pos + sym_len > _REDUCE_BUF_LEN) break;
      if (reader (&data->buf[pos], sym_len, data->aux_data) != sym_len) break;
      for (uint32_t i = 0; i < sym_len; i++, pos++, curr_ind++)
        decode_data->ind2pos[curr_ind] = pos;
    }
    ref_len = tag & _REDUCE_REF_TAG_LONG;
    if (ref_len != 0) {
      if (ref_len == _REDUCE_REF_TAG_LONG) {
        if ((r = _reduce_uint_read (reader, data->aux_data)) < 0) break;
        ref_len = r;
      }
      ref_len += _REDUCE_START_LEN - 1;
      if ((r = _reduce_uint_read (reader, data->aux_data)) < 0) break;
      ref_ind = r;
      if (curr_ind < ref_ind) break;
      sym_pos = decode_data->ind2pos[curr_ind - ref_ind];
      if (sym_pos + ref_len > _REDUCE_BUF_LEN) break;
      memcpy (&data->buf[pos], &data->buf[sym_pos], ref_len);
      decode_data->ind2pos[curr_ind++] = pos;
      pos += ref_len;
    }
    if (pos >= _REDUCE_BUF_LEN) {
      assert (pos == _REDUCE_BUF_LEN);
      data->check_hash = mir_hash_strict (data->buf, pos, data->check_hash);
      data->buf_bound = _REDUCE_BUF_LEN;
      decode_data->buf_get_pos = 0;
      return data->buf[decode_data->buf_get_pos++];
    }
  }
  data->ok_p = FALSE;
  return -1;
}

static inline int reduce_decode_finish (struct reduce_data *data) {
  uint8_t tag;
  int ok_p
    = data->ok_p && data->u.decode.eof_p && data->u.decode.reader (&tag, 1, data->aux_data) == 0;

  free (data);
  return ok_p;
}

#define _REDUCE_WRITE_IO_LEN 256
static inline int reduce_encode (reduce_reader_t reader, reduce_writer_t writer, void *aux_data) {
  size_t i, size;
  uint8_t buf[_REDUCE_WRITE_IO_LEN];
  struct reduce_data *data = reduce_encode_start (writer, aux_data);

  if (data == NULL) return FALSE;
  for (;;) {
    if ((size = reader (buf, _REDUCE_WRITE_IO_LEN, data->aux_data)) == 0) break;
    for (i = 0; i < size; i++) reduce_encode_put (data, buf[i]);
  }
  return reduce_encode_finish (data);
}

static inline int reduce_decode (reduce_reader_t reader, reduce_writer_t writer, void *aux_data) {
  int c, i;
  uint8_t buf[_REDUCE_WRITE_IO_LEN];
  struct reduce_data *data = reduce_decode_start (reader, aux_data);

  if (data == NULL) return FALSE;
  for (;;) {
    for (i = 0; i < _REDUCE_WRITE_IO_LEN && (c = reduce_decode_get (data)) >= 0; i++) buf[i] = c;
    if (i != 0) writer (buf, i, aux_data);
    if (c < 0) break;
  }
  return reduce_decode_finish (data);
}

#endif /* #ifndef MIR_REDUCE_H */