Research (#491)

* add advanced mode for optimal references generator
 * fix #489

Thanks to Ivan Nikulin for working on it.
diff --git a/research/draw_diff.cc b/research/draw_diff.cc
index 837e5d2..01b6716 100644
--- a/research/draw_diff.cc
+++ b/research/draw_diff.cc
@@ -84,16 +84,17 @@
     return 1;
   }
 
-  FILE* fimage1 = fopen(argv[1], "rb");
-  FILE* fimage2 = fopen(argv[2], "rb");
-  FILE* fdiff = fopen(argv[3], "wb");
-
   uint8_t **image1, **image2;
   size_t h1, w1, h2, w2;
+
+  FILE* fimage1 = fopen(argv[1], "rb");
   ReadPGM(fimage1, &image1, &h1, &w1);
-  ReadPGM(fimage2, &image2, &h2, &w2);
   fclose(fimage1);
+
+  FILE* fimage2 = fopen(argv[2], "rb");
+  ReadPGM(fimage2, &image2, &h2, &w2);
   fclose(fimage2);
+
   if (!(h1 == h2 && w1 == w2)) {
     printf("Images must have the same size.\n");
     return 1;
@@ -103,7 +104,9 @@
   for (size_t i = 0; i < h1; ++i) diff[i] = new int[w1];
   CalculateDiff(diff, image1, image2, h1, w1);
 
+  FILE* fdiff = fopen(argv[3], "wb");
   DrawDiff(diff, image1, image2, h1, w1, fdiff);
   fclose(fdiff);
+
   return 0;
 }
diff --git a/research/find_opt_references.cc b/research/find_opt_references.cc
index beab5b6..378d3b5 100644
--- a/research/find_opt_references.cc
+++ b/research/find_opt_references.cc
@@ -12,13 +12,23 @@
 #include <cassert>
 #include <cstdio>
 #include <cstring>
+#include <functional>
+#include <utility>
+#include <vector>
 
 #include <gflags/gflags.h>
 using gflags::ParseCommandLineFlags;
 
 #include "./esaxx/sais.hxx"
 
+DEFINE_bool(advanced, false, "Advanced searching mode: finds all longest "
+    "matches at positions that are not covered by matches of length at least "
+    "max_length. WARNING: uses much more memory than simple mode, especially "
+    "for small values of min_length.");
 DEFINE_int32(min_length, 1, "Minimal length of found backward references.");
+/* For advanced mode. */
+DEFINE_int32(long_length, 32,
+             "Maximal length of found backward references for advanced mode.");
 DEFINE_int32(skip, 1, "Number of bytes to skip.");
 
 const size_t kFileBufferSize = (1 << 16);  // 64KB
@@ -26,6 +36,9 @@
 typedef int sarray_type;  // Can't make it unsigned because of templates :(
 typedef uint8_t input_type;
 typedef uint32_t lcp_type;
+typedef std::pair<int, std::vector<int> > entry_type;
+typedef std::function<void(sarray_type*, lcp_type*, size_t, int, int, int, int,
+                           int)> Fn;
 
 void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
   size_t last_pos = 0;
@@ -59,12 +72,65 @@
   }
 }
 
-void ProcessReferences(input_type* storage, sarray_type* sarray, lcp_type* lcp,
-                       size_t size, uint32_t* pos, FILE* fout) {
+inline void PrintReference(sarray_type* sarray, lcp_type* lcp, size_t size,
+                           int idx, int left_ix, int right_ix, int left_lcp,
+                           int right_lcp, FILE* fout) {
+  int max_lcp_ix;
+  if (right_ix == size - 1 || (left_ix >= 0 && left_lcp >= right_lcp)) {
+    max_lcp_ix = left_ix;
+  } else {
+    max_lcp_ix = right_ix;
+  }
+  int dist = idx - sarray[max_lcp_ix];
+  assert(dist > 0);
+  fputc(1, fout);
+  fwrite(&idx, sizeof(int), 1, fout);   // Position in input.
+  fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
+}
+
+inline void GoLeft(sarray_type* sarray, lcp_type* lcp, int idx, int left_ix,
+                   int left_lcp, entry_type* entry) {
+  entry->first = left_lcp;
+  if (left_lcp > FLAGS_long_length) return;
+  for (; left_ix >= 0; --left_ix) {
+    if (lcp[left_ix] < left_lcp) break;
+    if (sarray[left_ix] < idx) {
+      entry->second.push_back(idx - sarray[left_ix]);
+    }
+  }
+}
+
+inline void GoRight(sarray_type* sarray, lcp_type* lcp, int idx, size_t size,
+                    int right_ix, int right_lcp, entry_type* entry) {
+  entry->first = right_lcp;
+  if (right_lcp > FLAGS_long_length) return;
+  for (; right_ix < size - 1; ++right_ix) {
+    if (lcp[right_ix] < right_lcp) break;
+    if (sarray[right_ix] < idx) {
+      entry->second.push_back(idx - sarray[right_ix]);
+    }
+  }
+}
+
+inline void StoreReference(sarray_type* sarray, lcp_type* lcp, size_t size,
+                           int idx, int left_ix, int right_ix, int left_lcp,
+                           int right_lcp, entry_type* entries) {
+  if (right_ix == size - 1 || (left_ix >= 0 && left_lcp > right_lcp)) {
+    // right is invalid or left is better
+    GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
+  } else if (left_ix < 0 || (right_ix < size - 1 && right_lcp > left_lcp)) {
+    // left is invalid or right is better
+    GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
+  } else {  // both are valid and of equal length
+    GoLeft(sarray, lcp, idx, left_ix, left_lcp, &entries[idx]);
+    GoRight(sarray, lcp, idx, size, right_ix, right_lcp, &entries[idx]);
+  }
+}
+
+void ProcessReferences(sarray_type* sarray, lcp_type* lcp, size_t size,
+                       uint32_t* pos, const Fn& process_output) {
   int min_length = FLAGS_min_length;
   for (int idx = FLAGS_skip; idx < size; ++idx) {
-    int max_lcp = -1;
-    int max_lcp_ix;
     int left_lcp = -1;
     int left_ix;
     for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
@@ -74,10 +140,6 @@
       if (left_lcp == 0) break;
       if (sarray[left_ix] < idx) break;
     }
-    if (left_ix >= 0) {
-      max_lcp = left_lcp;
-      max_lcp_ix = left_ix;
-    }
 
     int right_lcp = -1;
     int right_ix;
@@ -86,30 +148,48 @@
         right_lcp = lcp[right_ix];
       }
       // Stop if we have better result from the left side already.
-      if (right_lcp < max_lcp) break;
+      if (right_lcp < left_lcp && left_ix >= 0) break;
       if (right_lcp == 0) break;
       if (sarray[right_ix] < idx) break;
     }
-    if (right_lcp > max_lcp && right_ix < size - 1) {
-      max_lcp = right_lcp;
-      max_lcp_ix = right_ix;
-    }
 
-    if (max_lcp >= min_length) {
-      int dist = idx - sarray[max_lcp_ix];
-      if (dist <= 0) {
-        printf("idx = %d, pos[idx] = %u\n", idx, pos[idx]);
-        printf("left_ix = %d, right_ix = %d\n",
-                left_ix, right_ix);
-        printf("left_lcp = %d, right_lcp = %d\n",
-                left_lcp, right_lcp);
-        printf("sarray[left_ix] = %d, sarray[right_ix] = %d\n",
-                sarray[left_ix], sarray[right_ix]);
-        assert(dist > 0);
+    if ((left_ix >= 0 && left_lcp >= min_length) ||
+        (right_ix < size - 1 && right_lcp >= min_length)) {
+      process_output(sarray, lcp, size, idx, left_ix, right_ix, left_lcp,
+                     right_lcp);
+    }
+  }
+}
+
+void ProcessEntries(entry_type* entries, size_t size, FILE* fout) {
+  int long_length = FLAGS_long_length;
+  std::vector<std::pair<int, int> > segments;
+  size_t idx;
+  for (idx = 0; idx < size;) {
+    entry_type& entry = entries[idx];
+    if (entry.first > long_length) {
+      // Add segment.
+      if (segments.empty() || segments.back().second < idx) {
+        segments.push_back({idx, idx + entry.first});
+      } else {
+        segments.back().second = idx + entry.first;
       }
-      fputc(1, fout);
-      fwrite(&idx, sizeof(int), 1, fout);  // Position in input.
-      fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
+    }
+    ++idx;
+  }
+  printf("Segments generated.\n");
+  size_t segments_ix = 0;
+  for (idx = 0; idx < size;) {
+    if (idx == segments[segments_ix].first) {
+      // Skip segment.
+      idx = segments[segments_ix].second;
+    } else {
+      for (auto& dist : entries[idx].second) {
+        fputc(1, fout);
+        fwrite(&idx, sizeof(int), 1, fout);   // Position in input.
+        fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
+      }
+      ++idx;
     }
   }
 }
@@ -144,8 +224,44 @@
   lcp_type* lcp = new lcp_type[input_size];
   BuildLCP(storage, sarray, lcp, input_size, pos);
   printf("LCP array constructed.\n");
+  delete[] storage;
 
-  ProcessReferences(storage, sarray, lcp, input_size, pos, fout);
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+  using std::placeholders::_3;
+  using std::placeholders::_4;
+  using std::placeholders::_5;
+  using std::placeholders::_6;
+  using std::placeholders::_7;
+  using std::placeholders::_8;
+  entry_type* entries;
+  if (FLAGS_advanced) {
+    entries = new entry_type[input_size];
+    for (size_t i = 0; i < input_size; ++i) entries[i].first = -1;
+  }
+  Fn print = std::bind(PrintReference, _1, _2, _3, _4, _5, _6, _7, _8, fout);
+  Fn store = std::bind(StoreReference, _1, _2, _3, _4, _5, _6, _7, _8, entries);
+
+  ProcessReferences(sarray, lcp, input_size, pos,
+                    FLAGS_advanced ? store : print);
+  printf("References processed.\n");
+
+  if (FLAGS_advanced) {
+    int good_cnt = 0;
+    uint64_t avg_cnt = 0;
+    for (size_t i = 0; i < input_size; ++i) {
+      if (entries[i].first != -1) {
+        ++good_cnt;
+        avg_cnt += entries[i].second.size();
+      }
+    }
+    printf("Number of covered positions = %d\n", good_cnt);
+    printf("Average number of references per covered position = %.4lf\n",
+            static_cast<double>(avg_cnt) / good_cnt);
+    ProcessEntries(entries, input_size, fout);
+    printf("Entries processed.\n");
+  }
+
   fclose(fout);
   return 0;
 }