Add new (fast) dictionary generator engine. (#616)

Add CLI for dictionary generation.
Add BUILD file for research folder
diff --git a/research/BUILD b/research/BUILD
new file mode 100755
index 0000000..6ff5ac2
--- /dev/null
+++ b/research/BUILD
@@ -0,0 +1,29 @@
+# Description: brotli research tools.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # MIT
+
+cc_library(
+    name = "dm",
+    srcs = ["deorummolae.cc"],
+    hdrs = [
+        "deorummolae.h",
+        "esaxx/sais.hxx",
+    ],
+)
+
+cc_library(
+    name = "sieve",
+    srcs = ["sieve.cc"],
+    hdrs = ["sieve.h"],
+)
+
+cc_binary(
+    name = "dictionary_generator",
+    srcs = ["dictionary_generator.cc"],
+    deps = [
+        ":dm",
+        ":sieve",
+    ],
+)
diff --git a/research/deorummolae.cc b/research/deorummolae.cc
index f4cc53e..c53a53c 100644
--- a/research/deorummolae.cc
+++ b/research/deorummolae.cc
@@ -1,7 +1,7 @@
 #include "./deorummolae.h"
 
 #include <array>
-#include <vector>
+#include <cstdio>
 
 #include "./esaxx/sais.hxx"
 
@@ -20,9 +20,7 @@
 /* File coverage: every bit set to 1 denotes a file covered by an isle. */
 typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
 
-static int popcount(uint64_t u) {
-  return __builtin_popcountll(u);
-}
+static int popcount(uint64_t u) { return __builtin_popcountll(u); }
 
 /* Condense terminators and pad file entries. */
 static void rewriteText(std::vector<int>* text) {
@@ -155,9 +153,8 @@
   }
 }
 
-size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
-    size_t num_samples, const size_t* sample_sizes,
-    const uint8_t* sample_data) {
+std::string DM_generate(size_t dictionary_size_limit,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
   {
     uint64_t tmp = 0;
     if (popcount(tmp - 1u) != 64) {
@@ -169,9 +166,11 @@
   /* Could use 256 + '0' for easier debugging. */
   int next_terminator = 256;
 
+  std::string output;
   std::vector<std::vector<int>> data;
 
   size_t offset = 0;
+  size_t num_samples = sample_sizes.size();
   if (num_samples > MAX_FILES) num_samples = MAX_FILES;
   for (size_t n = 0; n < num_samples; ++n) {
     size_t next_offset = offset + sample_sizes[n];
@@ -208,7 +207,7 @@
     buildLcp(&full_text, &sa, &lcp, &invese_sa);
 
     /* Do not rebuild SA/LCP, just use different selection. */
-retry:
+  retry:
     best_cost = 0;
     best_isle = {0, 0, 0, {{0}}};
     isles.resize(0);
@@ -259,18 +258,16 @@
     }
 
     /* Save the entry. */
-    fprintf(stderr,
-      "Savings: %zu+%zu, dictionary: %zu+%d\n",
-      total_cost, best_cost, total, best_isle.lcp);
-    for (size_t i = 0; i < best_isle.lcp; ++i) {
-      dictionary[total + i] =
-          static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
-    }
+    fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
+        total_cost, best_cost, total, best_isle.lcp);
+    int* piece = &full_text[sa[best_isle.l]];
+    output.insert(output.end(), piece, piece + best_isle.lcp);
     total += best_isle.lcp;
     total_cost += best_cost;
-    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
-        &invese_sa, &next_terminator, &file_map, &file_offset);
+    cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
+        &next_terminator, &file_map, &file_offset);
     if (total >= dictionary_size_limit) break;
   }
-  return total;
+
+  return output;
 }
diff --git a/research/deorummolae.h b/research/deorummolae.h
index f37015c..7f24add 100644
--- a/research/deorummolae.h
+++ b/research/deorummolae.h
@@ -4,6 +4,9 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <string>
+#include <vector>
+
 /* log2(maximal number of files). Value 6 provides some speedups. */
 #define LOG_MAX_FILES 6
 
@@ -13,15 +16,12 @@
 /**
  * Generate a dictionary for given samples.
  *
- * @param dictionary storage for generated dictionary
  * @param dictionary_size_limit maximal dictionary size
- * @param num_samples number of samples
- * @param sample_sizes array with sample sizes
+ * @param sample_sizes vector with sample sizes
  * @param sample_data concatenated samples
- * @return generated dictionary size
+ * @return generated dictionary
  */
-size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
-    size_t num_samples, const size_t* sample_sizes,
-    const uint8_t* sample_data);
+std::string DM_generate(size_t dictionary_size_limit,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
 
 #endif  // BROTLI_RESEARCH_DEORUMMOLAE_H_
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
new file mode 100755
index 0000000..b3ee89c
--- /dev/null
+++ b/research/dictionary_generator.cc
@@ -0,0 +1,153 @@
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <vector>
+
+#include "./deorummolae.h"
+#include "./sieve.h"
+
+#define METHOD_DM 0
+#define METHOD_SIEVE 1
+
+size_t readInt(const char* str) {
+  size_t result = 0;
+  if (str[0] == 0 || str[0] == '0') {
+    return 0;
+  }
+  for (size_t i = 0; i < 13; ++i) {
+    if (str[i] == 0) {
+      return result;
+    }
+    if (str[i] == 'k' || str[i] == 'K') {
+      if ((str[i + 1] == 0) && ((result << 10) > result)) {
+        return result << 10;
+      }
+      return 0;
+    }
+    if (str[i] == 'm' || str[i] == 'M') {
+      if ((str[i + 1] == 0) && ((result << 20) > result)) {
+        return result << 20;
+      }
+      return 0;
+    }
+    if (str[i] < '0' || str[i] > '9') {
+      return 0;
+    }
+    size_t next = (10 * result) + (str[i] - '0');
+    if (next <= result) {
+      return 0;
+    }
+    result = next;
+  }
+  return 0;
+}
+
+static std::string readFile(const std::string& path) {
+  std::ifstream file(path);
+  std::string content(
+      (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+  return content;
+}
+
+static void writeFile(const char* file, const std::string& content) {
+  std::ofstream outfile(file, std::ofstream::binary);
+  outfile.write(content.c_str(), content.size());
+  outfile.close();
+}
+
+/* Returns "base file name" or its tail, if it contains '/' or '\'. */
+static const char* fileName(const char* path) {
+  const char* separator_position = strrchr(path, '/');
+  if (separator_position) path = separator_position + 1;
+  separator_position = strrchr(path, '\\');
+  if (separator_position) path = separator_position + 1;
+  return path;
+}
+
+static void printHelp(const char* name) {
+  fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
+  fprintf(stderr,
+      "Options:\n"
+      "  --dm     use 'deorummolae' engine\n"
+      "  --sieve  use 'sieve' engine (default)\n"
+      "  -t#      set target dictionary size (limit); default: 16K\n"
+      "  -s#      set slize length for 'sieve'; default: 33\n"
+      "# is a decimal number with optional k/K/m/M suffix.\n\n");
+}
+
+int main(int argc, char const* argv[]) {
+  int dictionaryArg = -1;
+  int method = METHOD_SIEVE;
+  int sieveSliceLen = 33;
+  int targetSize = 16 << 10;
+
+  std::vector<uint8_t> data;
+  std::vector<size_t> sizes;
+  size_t total = 0;
+  for (int i = 1; i < argc; ++i) {
+    if (argv[i] == nullptr) {
+      continue;
+    }
+    if (argv[i][0] == '-') {
+      if (argv[i][1] == '-') {
+        if (std::strcmp("--sieve", argv[i]) == 0) {
+          method = METHOD_SIEVE;
+          continue;
+        }
+        if (std::strcmp("--dm", argv[i]) == 0) {
+          method = METHOD_DM;
+          continue;
+        }
+        printHelp(fileName(argv[0]));
+        fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+        exit(1);
+      }
+      if (argv[i][1] == 's') {
+        sieveSliceLen = readInt(&argv[i][2]);
+        if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
+      } else if (argv[i][1] == 't') {
+        targetSize = readInt(&argv[i][2]);
+        if (targetSize < 256 || targetSize > (1 << 25)) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
+      } else {
+        printHelp(fileName(argv[0]));
+        fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
+        exit(1);
+      }
+      continue;
+    }
+    if (dictionaryArg == -1) {
+      dictionaryArg = i;
+      continue;
+    }
+    std::string content = readFile(argv[i]);
+    data.insert(data.end(), content.begin(), content.end());
+    total += content.size();
+    sizes.push_back(content.size());
+  }
+  if (dictionaryArg == -1 || total == 0) {
+    printHelp(fileName(argv[0]));
+    fprintf(stderr, "Not enough arguments\n");
+    exit(1);
+  }
+
+  if (method == METHOD_SIEVE) {
+    writeFile(argv[dictionaryArg],
+        sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+  } else if (method == METHOD_DM) {
+    writeFile(argv[dictionaryArg],
+        DM_generate(targetSize, sizes, data.data()));
+  } else {
+    printHelp(fileName(argv[0]));
+    fprintf(stderr, "Unknown generator\n");
+    exit(1);
+  }
+  return 0;
+}
diff --git a/research/sieve.cc b/research/sieve.cc
new file mode 100755
index 0000000..fbc1dbf
--- /dev/null
+++ b/research/sieve.cc
@@ -0,0 +1,211 @@
+#include "./sieve.h"
+
+typedef struct Slot {
+  uint32_t next;
+  uint32_t offset;
+  uint16_t presence;
+  uint16_t mark;
+} Slot;
+
+static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
+    size_t middle, uint16_t minPresence, uint16_t iteration) {
+  int from = -2;
+  int to = -1;
+  size_t result = 0;
+  uint16_t targetPresence = minPresence;
+  for (uint32_t i = 0; i < end; ++i) {
+    if (i == middle) {
+      targetPresence++;
+    }
+    Slot& item = map[shortcut[i]];
+    if (item.mark != iteration) {
+      item.mark = iteration;
+      if (item.presence >= targetPresence) {
+        if (to < i) {
+          if (from > 0) {
+            result += to - from;
+          }
+          from = i;
+        }
+        to = i + sliceLen;
+      }
+    }
+  }
+  if (from > 0) {
+    result += to - from;
+  }
+  return result;
+}
+
+static std::string createDictionary(const uint8_t* data, size_t sliceLen,
+    Slot* map, uint32_t* shortcut, size_t end, size_t middle,
+    uint16_t minPresence, uint16_t iteration) {
+  std::string output;
+  int from = -2;
+  int to = -1;
+  uint16_t targetPresence = minPresence;
+  for (uint32_t i = 0; i < end; ++i) {
+    if (i == middle) {
+      targetPresence++;
+    }
+    Slot& item = map[shortcut[i]];
+    if (item.mark != iteration) {
+      item.mark = iteration;
+      if (item.presence >= targetPresence) {
+        if (to < i) {
+          if (from > 0) {
+            output.insert(output.end(), &data[from], &data[to]);
+          }
+          from = i;
+        }
+        to = i + sliceLen;
+      }
+    }
+  }
+  if (from > 0) {
+    output.insert(output.end(), &data[from], &data[to]);
+  }
+  return output;
+}
+
+std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+  /* Parameters aliasing */
+  size_t targetSize = dictionary_size_limit;
+  size_t sliceLen = slice_len;
+  const uint8_t* data = sample_data;
+
+  size_t total = 0;
+  std::vector<size_t> offsets;
+  for (size_t i = 0; i < sample_sizes.size(); ++i) {
+    total += sample_sizes[i];
+    offsets.push_back(total);
+  }
+
+  /*****************************************************************************
+   * Build coverage map.
+   ****************************************************************************/
+  std::vector<Slot> map;
+  std::vector<uint32_t> shortcut;
+  map.push_back({0, 0, 0, 0});
+  size_t end = total - sliceLen;
+  int hashLen = 8;
+  while ((1 << hashLen) < end) {
+    hashLen += 3;
+  }
+  hashLen -= 3;
+  uint32_t hashMask = (1u << hashLen) - 1u;
+  std::vector<uint32_t> hashHead(1 << hashLen);
+  uint32_t hashSlot = 1;
+  uint16_t piece = 0;
+  uint32_t hash = 0;
+  int lShift = 3;
+  int rShift = hashLen - lShift;
+  for (int i = 0; i < sliceLen - 1; ++i) {
+    uint32_t v = data[i];
+    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+  }
+  int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+  int rShiftX = hashLen - lShiftX;
+  for (uint32_t i = 0; i < end; ++i) {
+    uint32_t v = data[i + sliceLen - 1];
+    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+
+    if (offsets[piece] == i) {
+      piece++;
+    }
+    uint32_t slot = hashHead[hash];
+    while (slot != 0) {
+      Slot& item = map[slot];
+      int start = item.offset;
+      bool miss = false;
+      for (size_t j = 0; j < sliceLen; ++j) {
+        if (data[i + j] != data[start + j]) {
+          miss = true;
+          break;
+        }
+      }
+      if (!miss) {
+        if (item.mark != piece) {
+          item.mark = piece;
+          item.presence++;
+        }
+        shortcut.push_back(slot);
+        break;
+      }
+      slot = item.next;
+    }
+    if (slot == 0) {
+      map.push_back({hashHead[hash], i, 1, piece});
+      hashHead[hash] = hashSlot;
+      shortcut.push_back(hashSlot);
+      hashSlot++;
+    }
+    v = data[i];
+    hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
+  }
+
+  /*****************************************************************************
+   * Build dictionary of specified size.
+   ****************************************************************************/
+  size_t a = 1;
+  size_t size = dryRun(
+      sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
+  /* Maximal output is smaller than target. */
+  if (size <= targetSize) {
+    return createDictionary(
+        data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
+  }
+
+  size_t b = offsets.size();
+  size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+  if (size == targetSize) {
+    return createDictionary(
+        data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+  }
+  /* Run binary search. */
+  if (size < targetSize) {
+    /* size(a) > targetSize > size(b) && a < m < b */
+    while (a + 1 < b) {
+      size_t m = (a + b) / 2;
+      size = dryRun(
+          sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
+      if (size < targetSize) {
+        b = m;
+      } else if (size > targetSize) {
+        a = m;
+      } else {
+        return createDictionary(
+            data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+      }
+    }
+  } else {
+    a = b;
+  }
+  /* size(minPresence) > targetSize > size(minPresence + 1) */
+  size_t minPresence = a;
+  a = 0;
+  b = end;
+  /* size(a) < targetSize < size(b) && a < m < b */
+  while (a + 1 < b) {
+    size_t m = (a + b) / 2;
+    size = dryRun(
+        sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
+    if (size < targetSize) {
+      a = m;
+    } else if (size > targetSize) {
+      b = m;
+    } else {
+      return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
+          m, minPresence, ++piece);
+    }
+  }
+
+  bool unrestricted = false;
+  if (minPresence <= 2 && !unrestricted) {
+    minPresence = 2;
+    a = end;
+  }
+  return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
+      minPresence, ++piece);
+}
diff --git a/research/sieve.h b/research/sieve.h
new file mode 100755
index 0000000..f529835
--- /dev/null
+++ b/research/sieve.h
@@ -0,0 +1,21 @@
+#ifndef BROTLI_RESEARCH_SIEVE_H_
+#define BROTLI_RESEARCH_SIEVE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+/**
+ * Generate a dictionary for given samples.
+ *
+ * @param dictionary_size_limit maximal dictionary size
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data concatenated samples
+ * @return generated dictionary
+ */
+std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+#endif  // BROTLI_RESEARCH_SIEVE_H_