Merge pull request #12 from delfrrr/remove-recursion

Remove recursion, speed up modulus
diff --git a/.gitignore b/.gitignore
index 7dde577..375f2a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@
 mason_packages
 .toolchain
 .mason
-local.env
\ No newline at end of file
+local.env
+xcode-project
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 903e417..c7c8f72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,9 @@
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/mason.cmake)
 
 option(WERROR "Add -Werror flag to build (turns warnings into errors)" ON)
+option(BENCHMARK_BIG_O "Calculate Big O in benchmark" OFF)
+option(BENCHMARK_100M "Run against 100M points" OFF)
+option(BENCHMARK_10M "Run against 100M points" OFF)
 
 # configure optimization
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
@@ -50,6 +53,18 @@
 find_package(Threads REQUIRED)
 file(GLOB BENCH_SOURCES bench/*.cpp)
 add_executable(bench-tests ${BENCH_SOURCES})
+if(BENCHMARK_BIG_O)
+    message("-- BENCHMARK_BIG_O=1")
+    target_compile_definitions(bench-tests PUBLIC BENCHMARK_BIG_O=1)
+endif()
+if(BENCHMARK_100M)
+    message("-- BENCHMARK_100M=1")
+    target_compile_definitions(bench-tests PUBLIC BENCHMARK_100M=1)
+endif()
+if(BENCHMARK_10M)
+    message("-- BENCHMARK_10M=1")
+    target_compile_definitions(bench-tests PUBLIC BENCHMARK_10M=1)
+endif()
 
 #examples
 add_executable(triangulate-geojson examples/triangulate_geojson.cpp)
diff --git a/Makefile b/Makefile
index 5abaceb..8a0ac69 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@
 # Whether to turn compiler warnings into errors
 export WERROR ?= true
 export BUILD_DIR ?= cmake-build
+export XCODE_PROJ_DIR ?= xcode-project
 
 default: release
 
@@ -11,6 +12,9 @@
 debug:
 	mkdir -p ./$(BUILD_DIR) && cd ./$(BUILD_DIR) && cmake ../ -DCMAKE_BUILD_TYPE=Debug -DWERROR=$(WERROR) && VERBOSE=1 cmake --build .
 
+xcode:
+	mkdir -p ./$(XCODE_PROJ_DIR) && cd ./$(XCODE_PROJ_DIR) && cmake -G Xcode ../
+
 test:
 	@if [ -f ./$(BUILD_DIR)/unit-tests ]; then ./$(BUILD_DIR)/unit-tests; else echo "Please run 'make release' or 'make debug' first" && exit 1; fi
 
diff --git a/README.md b/README.md
index c747122..921ab45 100644
--- a/README.md
+++ b/README.md
@@ -48,16 +48,16 @@
 
 ```
 Run on (4 X 2300 MHz CPU s)
-2018-09-26 09:28:34
+2018-09-29 09:27:28
 ------------------------------------------------------------
 Benchmark                     Time           CPU Iterations
 ------------------------------------------------------------
-BM_45K_geojson_nodes         24 ms         24 ms         29
-BM_uniform/2000               1 ms          1 ms        887
-BM_uniform/100000            66 ms         66 ms          9
-BM_uniform/200000           158 ms        155 ms          4
-BM_uniform/500000           441 ms        439 ms          2
-BM_uniform/1000000         1062 ms       1058 ms          1
+BM_45K_geojson_nodes         22 ms         22 ms         32
+BM_uniform/2000               1 ms          1 ms        982
+BM_uniform/100000            63 ms         62 ms          9
+BM_uniform/200000           140 ms        140 ms          4
+BM_uniform/500000           400 ms        399 ms          2
+BM_uniform/1000000          994 ms        993 ms          1
 ```
 
 Library is ~10% faster then JS version for 1M uniform points ([details](https://github.com/delfrrr/delaunator-cpp/pull/8#issuecomment-422690056))
diff --git a/bench/run.cpp b/bench/run.cpp
index d48f041..9b449b2 100644
--- a/bench/run.cpp
+++ b/bench/run.cpp
@@ -7,6 +7,7 @@
 
 std::vector<double> generate_uniform(std::size_t n) {
     std::vector<double> coords;
+    coords.reserve(2 * n);
     std::srand(350);
     double norm = static_cast<double>(RAND_MAX) / 1e3;
     for (size_t i = 0; i < n; i++) {
@@ -31,9 +32,20 @@
     while (state.KeepRunning()) {
         delaunator::Delaunator delaunator(coords);
     }
+    state.SetComplexityN(state.range(0));
 }
 
 BENCHMARK(BM_45K_geojson_nodes)->Unit(benchmark::kMillisecond);
 BENCHMARK(BM_uniform)->Arg(2000)->Arg(100000)->Arg(200000)->Arg(500000)->Arg(1000000)->Unit(benchmark::kMillisecond);
 
+#if BENCHMARK_BIG_O
+BENCHMARK(BM_uniform)->RangeMultiplier(2)->Range(1 << 12, 1 << 22)->Unit(benchmark::kMillisecond)->Complexity();
+#endif
+#if BENCHMARK_10M
+BENCHMARK(BM_uniform)->Arg(1000000 * 10)->Unit(benchmark::kMillisecond);
+#endif
+#if BENCHMARK_100M
+BENCHMARK(BM_uniform)->Arg(1000000 * 100)->Unit(benchmark::kMillisecond);
+#endif
+
 BENCHMARK_MAIN()
diff --git a/examples/utils.hpp b/examples/utils.hpp
index 02c32df..d22cb42 100644
--- a/examples/utils.hpp
+++ b/examples/utils.hpp
@@ -22,7 +22,7 @@
     }
 }
 
-inline std::vector<double> get_geo_json_points(std::string const& json) {
+inline std::vector< double> get_geo_json_points(std::string const& json) {
     rapidjson::Document document;
     if(document.Parse(json.c_str()).HasParseError()) {
         throw std::runtime_error("Cannot parse JSON");
diff --git a/include/delaunator.hpp b/include/delaunator.hpp
index e16e0c3..6bc5221 100644
--- a/include/delaunator.hpp
+++ b/include/delaunator.hpp
@@ -11,6 +11,11 @@
 
 namespace delaunator {
 
+//@see https://stackoverflow.com/questions/33333363/built-in-mod-vs-custom-mod-function-improve-the-performance-of-modulus-op/33333636#33333636
+inline size_t fast_mod(const size_t i, const size_t c) {
+    return i >= c ? i % c : i;
+}
+
 // Kahan and Babuska summation, Neumaier variant; accumulates less FP error
 inline double sum(const std::vector<double>& x) {
     double sum = x[0];
@@ -93,47 +98,38 @@
     return std::make_pair(x, y);
 }
 
-inline double compare(
-    std::vector<double> const& coords,
-    std::size_t i,
-    std::size_t j,
-    double cx,
-    double cy) {
-    const double d1 = dist(coords[2 * i], coords[2 * i + 1], cx, cy);
-    const double d2 = dist(coords[2 * j], coords[2 * j + 1], cx, cy);
-    const double diff1 = d1 - d2;
-    const double diff2 = coords[2 * i] - coords[2 * j];
-    const double diff3 = coords[2 * i + 1] - coords[2 * j + 1];
-
-    if (diff1 > 0.0 || diff1 < 0.0) {
-        return diff1;
-    } else if (diff2 > 0.0 || diff2 < 0.0) {
-        return diff2;
-    } else {
-        return diff3;
-    }
-}
-
-struct sort_to_center {
+struct compare {
 
     std::vector<double> const& coords;
     double cx;
     double cy;
 
     bool operator()(std::size_t i, std::size_t j) {
-        return compare(coords, i, j, cx, cy) < 0;
+        const double d1 = dist(coords[2 * i], coords[2 * i + 1], cx, cy);
+        const double d2 = dist(coords[2 * j], coords[2 * j + 1], cx, cy);
+        const double diff1 = d1 - d2;
+        const double diff2 = coords[2 * i] - coords[2 * j];
+        const double diff3 = coords[2 * i + 1] - coords[2 * j + 1];
+
+        if (diff1 > 0.0 || diff1 < 0.0) {
+            return diff1 < 0;
+        } else if (diff2 > 0.0 || diff2 < 0.0) {
+            return diff2 < 0;
+        } else {
+            return diff3 < 0;
+        }
     }
 };
 
 inline bool in_circle(
-    double ax,
-    double ay,
-    double bx,
-    double by,
-    double cx,
-    double cy,
-    double px,
-    double py) {
+    const double ax,
+    const double ay,
+    const double bx,
+    const double by,
+    const double cx,
+    const double cy,
+    const double px,
+    const double py) {
     const double dx = ax - px;
     const double dy = ay - py;
     const double ex = bx - px;
@@ -159,7 +155,7 @@
 }
 
 // monotonically increases with real angle, but doesn't need expensive trigonometry
-inline double pseudo_angle(double dx, double dy) {
+inline double pseudo_angle(const double dx, const double dy) {
     const double p = dx / (std::abs(dx) + std::abs(dy));
     return (dy > 0.0 ? 3.0 - p : 1.0 + p) / 4.0; // [0..1)
 }
@@ -194,9 +190,10 @@
     double m_center_x;
     double m_center_y;
     std::size_t m_hash_size;
+    std::vector<std::size_t> m_edge_stack;
 
     std::size_t legalize(std::size_t a);
-    std::size_t hash_key(double x, double y);
+    std::size_t hash_key(double x, double y) const;
     std::size_t add_triangle(
         std::size_t i0,
         std::size_t i1,
@@ -218,7 +215,8 @@
       m_hash(),
       m_center_x(),
       m_center_y(),
-      m_hash_size() {
+      m_hash_size(),
+      m_edge_stack() {
     std::size_t n = coords.size() >> 1;
 
     double max_x = std::numeric_limits<double>::min();
@@ -305,7 +303,7 @@
     std::tie(m_center_x, m_center_y) = circumcenter(i0x, i0y, i1x, i1y, i2x, i2y);
 
     // sort the points by distance from the seed triangle circumcenter
-    std::sort(ids.begin(), ids.end(), sort_to_center{ coords, m_center_x, m_center_y });
+    std::sort(ids.begin(), ids.end(), compare{ coords, m_center_x, m_center_y });
 
     // initialize a hash table for storing edges of the advancing convex hull
     m_hash_size = static_cast<std::size_t>(std::llround(std::ceil(std::sqrt(n))));
@@ -360,7 +358,7 @@
 
         size_t key = hash_key(x, y);
         for (size_t j = 0; j < m_hash_size; j++) {
-            start = m_hash[(key + j) % m_hash_size];
+            start = m_hash[fast_mod(key + j, m_hash_size)];
             if (start != INVALID_INDEX && start != hull_next[start]) break;
         }
 
@@ -440,84 +438,110 @@
 }
 
 std::size_t Delaunator::legalize(std::size_t a) {
-    const std::size_t b = halfedges[a];
+    std::size_t i = 0;
+    std::size_t ar = 0;
+    m_edge_stack.clear();
 
-    /* if the pair of triangles doesn't satisfy the Delaunay condition
-    * (p1 is inside the circumcircle of [p0, pl, pr]), flip them,
-    * then do the same check/flip recursively for the new pair of triangles
-    *
-    *           pl                    pl
-    *          /||\                  /  \
-    *       al/ || \bl            al/    \a
-    *        /  ||  \              /      \
-    *       /  a||b  \    flip    /___ar___\
-    *     p0\   ||   /p1   =>   p0\---bl---/p1
-    *        \  ||  /              \      /
-    *       ar\ || /br             b\    /br
-    *          \||/                  \  /
-    *           pr                    pr
-    */
-    const std::size_t a0 = a - a % 3;
-    const std::size_t b0 = b - b % 3;
+    // recursion eliminated with a fixed-size stack
+    while (true) {
+        const size_t b = halfedges[a];
 
-    const std::size_t al = a0 + (a + 1) % 3;
-    const std::size_t ar = a0 + (a + 2) % 3;
-    const std::size_t bl = b0 + (b + 2) % 3;
+        /* if the pair of triangles doesn't satisfy the Delaunay condition
+        * (p1 is inside the circumcircle of [p0, pl, pr]), flip them,
+        * then do the same check/flip recursively for the new pair of triangles
+        *
+        *           pl                    pl
+        *          /||\                  /  \
+        *       al/ || \bl            al/    \a
+        *        /  ||  \              /      \
+        *       /  a||b  \    flip    /___ar___\
+        *     p0\   ||   /p1   =>   p0\---bl---/p1
+        *        \  ||  /              \      /
+        *       ar\ || /br             b\    /br
+        *          \||/                  \  /
+        *           pr                    pr
+        */
+        const size_t a0 = 3 * (a / 3);
+        ar = a0 + (a + 2) % 3;
 
-    const std::size_t p0 = triangles[ar];
-    const std::size_t pr = triangles[a];
-    const std::size_t pl = triangles[al];
-    const std::size_t p1 = triangles[bl];
-
-    if (b == INVALID_INDEX) {
-        return ar;
-    }
-
-    const bool illegal = in_circle(
-        coords[2 * p0],
-        coords[2 * p0 + 1],
-        coords[2 * pr],
-        coords[2 * pr + 1],
-        coords[2 * pl],
-        coords[2 * pl + 1],
-        coords[2 * p1],
-        coords[2 * p1 + 1]);
-
-    if (illegal) {
-        triangles[a] = p1;
-        triangles[b] = p0;
-
-        auto hbl = halfedges[bl];
-
-        // edge swapped on the other side of the hull (rare); fix the halfedge reference
-        if (hbl == INVALID_INDEX) {
-            std::size_t e = hull_start;
-            do {
-                if (hull_tri[e] == bl) {
-                    hull_tri[e] = a;
-                    break;
-                }
-                e = hull_next[e];
-            } while (e != hull_start);
+        if (b == INVALID_INDEX) {
+            if (i > 0) {
+                i--;
+                a = m_edge_stack[i];
+                continue;
+            } else {
+                //i = INVALID_INDEX;
+                break;
+            }
         }
-        link(a, hbl);
-        link(b, halfedges[ar]);
-        link(ar, bl);
 
-        std::size_t br = b0 + (b + 1) % 3;
+        const size_t b0 = 3 * (b / 3);
+        const size_t al = a0 + (a + 1) % 3;
+        const size_t bl = b0 + (b + 2) % 3;
 
-        legalize(a);
-        return legalize(br);
+        const std::size_t p0 = triangles[ar];
+        const std::size_t pr = triangles[a];
+        const std::size_t pl = triangles[al];
+        const std::size_t p1 = triangles[bl];
+
+        const bool illegal = in_circle(
+            coords[2 * p0],
+            coords[2 * p0 + 1],
+            coords[2 * pr],
+            coords[2 * pr + 1],
+            coords[2 * pl],
+            coords[2 * pl + 1],
+            coords[2 * p1],
+            coords[2 * p1 + 1]);
+
+        if (illegal) {
+            triangles[a] = p1;
+            triangles[b] = p0;
+
+            auto hbl = halfedges[bl];
+
+            // edge swapped on the other side of the hull (rare); fix the halfedge reference
+            if (hbl == INVALID_INDEX) {
+                std::size_t e = hull_start;
+                do {
+                    if (hull_tri[e] == bl) {
+                        hull_tri[e] = a;
+                        break;
+                    }
+                    e = hull_next[e];
+                } while (e != hull_start);
+            }
+            link(a, hbl);
+            link(b, halfedges[ar]);
+            link(ar, bl);
+            std::size_t br = b0 + (b + 1) % 3;
+
+            if (i < m_edge_stack.size()) {
+                m_edge_stack[i] = br;
+            } else {
+                m_edge_stack.push_back(br);
+            }
+            i++;
+
+        } else {
+            if (i > 0) {
+                i--;
+                a = m_edge_stack[i];
+                continue;
+            } else {
+                break;
+            }
+        }
     }
     return ar;
 }
 
-std::size_t Delaunator::hash_key(double x, double y) {
+inline std::size_t Delaunator::hash_key(const double x, const double y) const {
     const double dx = x - m_center_x;
     const double dy = y - m_center_y;
-    return static_cast<std::size_t>(std::llround(
-               std::floor(pseudo_angle(dx, dy) * static_cast<double>(m_hash_size)))) %
-           m_hash_size;
+    return fast_mod(
+        static_cast<std::size_t>(std::llround(std::floor(pseudo_angle(dx, dy) * static_cast<double>(m_hash_size)))),
+        m_hash_size);
 }
 
 std::size_t Delaunator::add_triangle(
@@ -537,7 +561,7 @@
     return t;
 }
 
-void Delaunator::link(std::size_t a, std::size_t b) {
+void Delaunator::link(const std::size_t a, const std::size_t b) {
     std::size_t s = halfedges.size();
     if (a == s) {
         halfedges.push_back(b);