Merge branch 'master' into bitops
diff --git a/configure.ac b/configure.ac
index d8c38fe..02a4a89 100644
--- a/configure.ac
+++ b/configure.ac
@@ -20,6 +20,7 @@
 # Check for programs
 AC_USE_SYSTEM_EXTENSIONS
 AC_PROG_CC
+AC_PROG_CC_C99
 AM_PROG_CC_C_O
 AC_PROG_CXX
 dnl AX_CXX_COMPILE_STDCXX(11, noext, optional)
@@ -75,7 +76,8 @@
 	AM_CONDITIONAL([ENABLE_GTK_DOC], false)
 ])
 
-# Functions and headers
+# Types, functions, and headers
+AC_CHECK_TYPES(unsigned __int128)
 AC_CHECK_FUNCS(atexit mprotect sysconf getpagesize mmap isatty newlocale strtod_l setlinebuf)
 AC_CHECK_HEADERS(unistd.h sys/mman.h xlocale.h)
 
diff --git a/src/hb-ot-layout-gpos-table.hh b/src/hb-ot-layout-gpos-table.hh
index 14ee371..5b1bc6a 100644
--- a/src/hb-ot-layout-gpos-table.hh
+++ b/src/hb-ot-layout-gpos-table.hh
@@ -99,7 +99,7 @@
 #endif
 
   inline unsigned int get_len (void) const
-  { return _hb_popcount32 ((unsigned int) *this); }
+  { return _hb_popcount ((unsigned int) *this); }
   inline unsigned int get_size (void) const
   { return get_len () * Value::static_size; }
 
diff --git a/src/hb-ot-map.cc b/src/hb-ot-map.cc
index 6f07a7e..54b0ce3 100644
--- a/src/hb-ot-map.cc
+++ b/src/hb-ot-map.cc
@@ -139,7 +139,7 @@
 {
   static_assert ((!(HB_GLYPH_FLAG_DEFINED & (HB_GLYPH_FLAG_DEFINED + 1))), "");
   unsigned int global_bit_mask = HB_GLYPH_FLAG_DEFINED + 1;
-  unsigned int global_bit_shift = _hb_popcount32 (HB_GLYPH_FLAG_DEFINED);
+  unsigned int global_bit_shift = _hb_popcount (HB_GLYPH_FLAG_DEFINED);
 
   m.global_mask = global_bit_mask;
 
diff --git a/src/hb-private.hh b/src/hb-private.hh
index aeed58a..583b561 100644
--- a/src/hb-private.hh
+++ b/src/hb-private.hh
@@ -51,7 +51,7 @@
 #include <stdio.h>
 #include <stdarg.h>
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #endif
 
@@ -322,72 +322,189 @@
 typedef const struct _hb_void_t *hb_void_t;
 #define HB_VOID ((const _hb_void_t *) nullptr)
 
-/* Return the number of 1 bits in mask. */
+/* Return the number of 1 bits in v. */
+template <typename T>
 static inline HB_CONST_FUNC unsigned int
-_hb_popcount32 (uint32_t mask)
+_hb_popcount (T v)
 {
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-  return __builtin_popcount (mask);
-#else
-  /* "HACKMEM 169" */
-  uint32_t y;
-  y = (mask >> 1) &033333333333;
-  y = mask - y - ((y >>1) & 033333333333);
-  return (((y + (y >> 3)) & 030707070707) % 077);
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) && defined(__OPTIMIZE__)
+  if (sizeof (T) <= sizeof (unsigned int))
+    return __builtin_popcount (v);
+
+  if (sizeof (T) <= sizeof (unsigned long))
+    return __builtin_popcountl (v);
+
+  if (sizeof (T) <= sizeof (unsigned long long))
+    return __builtin_popcountll (v);
 #endif
+
+  if (sizeof (T) <= 4)
+  {
+    /* "HACKMEM 169" */
+    uint32_t y;
+    y = (v >> 1) &033333333333;
+    y = v - y - ((y >>1) & 033333333333);
+    return (((y + (y >> 3)) & 030707070707) % 077);
+  }
+
+  if (sizeof (T) == 8)
+  {
+    unsigned int shift = 32;
+    return _hb_popcount<uint32_t> ((uint32_t) v) + _hb_popcount ((uint32_t) (v >> shift));
+  }
+
+  if (sizeof (T) == 16)
+  {
+    unsigned int shift = 64;
+    return _hb_popcount<uint64_t> ((uint64_t) v) + _hb_popcount ((uint64_t) (v >> shift));
+  }
+
+  assert (0);
 }
-static inline HB_CONST_FUNC unsigned int
-_hb_popcount64 (uint64_t mask)
-{
-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-  if (sizeof (long) >= sizeof (mask))
-    return __builtin_popcountl (mask);
-#endif
-  return _hb_popcount32 (mask & 0xFFFFFFFF) + _hb_popcount32 (mask >> 32);
-}
-template <typename T> static inline unsigned int _hb_popcount (T mask);
-template <> inline unsigned int _hb_popcount<uint32_t> (uint32_t mask) { return _hb_popcount32 (mask); }
-template <> inline unsigned int _hb_popcount<uint64_t> (uint64_t mask) { return _hb_popcount64 (mask); }
 
 /* Returns the number of bits needed to store number */
+template <typename T>
 static inline HB_CONST_FUNC unsigned int
-_hb_bit_storage (unsigned int number)
+_hb_bit_storage (T v)
 {
+  if (unlikely (!v)) return 0;
+
 #if defined(__GNUC__) && (__GNUC__ >= 4) && defined(__OPTIMIZE__)
-  return likely (number) ? (sizeof (unsigned int) * 8 - __builtin_clz (number)) : 0;
-#elif defined(_MSC_VER)
-  unsigned long where;
-  if (_BitScanReverse (&where, number)) return 1 + where;
-  return 0;
-#else
-  unsigned int n_bits = 0;
-  while (number) {
-    n_bits++;
-    number >>= 1;
-  }
-  return n_bits;
+  if (sizeof (T) <= sizeof (unsigned int))
+    return sizeof (unsigned int) * 8 - __builtin_clz (v);
+
+  if (sizeof (T) <= sizeof (unsigned long))
+    return sizeof (unsigned long) * 8 - __builtin_clzl (v);
+
+  if (sizeof (T) <= sizeof (unsigned long long))
+    return sizeof (unsigned long long) * 8 - __builtin_clzll (v);
 #endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  if (sizeof (T) <= sizeof (unsigned int))
+  {
+    unsigned long where;
+    _BitScanReverse (&where, v);
+    return 1 + where;
+  }
+# if _WIN64
+  if (sizeof (T) <= 8)
+  {
+    unsigned long where;
+    _BitScanReverse64 (&where, v);
+    return 1 + where;
+  }
+# endif
+#endif
+
+  if (sizeof (T) <= 4)
+  {
+    /* "bithacks" */
+    const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+    const unsigned int S[] = {1, 2, 4, 8, 16};
+    unsigned int r = 0;
+    for (int i = 4; i >= 0; i--)
+      if (v & b[i])
+      {
+	v >>= S[i];
+	r |= S[i];
+      }
+    return r + 1;
+  }
+  if (sizeof (T) <= 8)
+  {
+    /* "bithacks" */
+    const uint64_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000};
+    const unsigned int S[] = {1, 2, 4, 8, 16, 32};
+    unsigned int r = 0;
+    for (int i = 5; i >= 0; i--)
+      if (v & b[i])
+      {
+	v >>= S[i];
+	r |= S[i];
+      }
+    return r + 1;
+  }
+  if (sizeof (T) == 16)
+  {
+    unsigned int shift = 64;
+    return (v >> shift) ? _hb_bit_storage<uint64_t> ((uint64_t) v >> shift) + shift :
+			  _hb_bit_storage<uint64_t> ((uint64_t) v);
+  }
+
+  assert (0);
 }
 
-/* Returns the number of zero bits in the least significant side of number */
+/* Returns the number of zero bits in the least significant side of v */
+template <typename T>
 static inline HB_CONST_FUNC unsigned int
-_hb_ctz (unsigned int number)
+_hb_ctz (T v)
 {
+  if (unlikely (!v)) return 0;
+
 #if defined(__GNUC__) && (__GNUC__ >= 4) && defined(__OPTIMIZE__)
-  return likely (number) ? __builtin_ctz (number) : 0;
-#elif defined(_MSC_VER)
-  unsigned long where;
-  if (_BitScanForward (&where, number)) return where;
-  return 0;
-#else
-  unsigned int n_bits = 0;
-  if (unlikely (!number)) return 0;
-  while (!(number & 1)) {
-    n_bits++;
-    number >>= 1;
-  }
-  return n_bits;
+  if (sizeof (T) <= sizeof (unsigned int))
+    return __builtin_ctz (v);
+
+  if (sizeof (T) <= sizeof (unsigned long))
+    return __builtin_ctzl (v);
+
+  if (sizeof (T) <= sizeof (unsigned long long))
+    return __builtin_ctzll (v);
 #endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  if (sizeof (T) <= sizeof (unsigned int))
+  {
+    unsigned long where;
+    _BitScanForward (&where, v);
+    return 1 + where;
+  }
+# if _WIN64
+  if (sizeof (T) <= 8)
+  {
+    unsigned long where;
+    _BitScanForward64 (&where, v);
+    return 1 + where;
+  }
+# endif
+#endif
+
+  if (sizeof (T) <= 4)
+  {
+    /* "bithacks" */
+    unsigned int c = 32;
+    v &= - (int32_t) v;
+    if (v) c--;
+    if (v & 0x0000FFFF) c -= 16;
+    if (v & 0x00FF00FF) c -= 8;
+    if (v & 0x0F0F0F0F) c -= 4;
+    if (v & 0x33333333) c -= 2;
+    if (v & 0x55555555) c -= 1;
+    return c;
+  }
+  if (sizeof (T) <= 8)
+  {
+    /* "bithacks" */
+    unsigned int c = 64;
+    v &= - (int64_t) (v);
+    if (v) c--;
+    if (v & 0x00000000FFFFFFFF) c -= 32;
+    if (v & 0x0000FFFF0000FFFF) c -= 16;
+    if (v & 0x00FF00FF00FF00FF) c -= 8;
+    if (v & 0x0F0F0F0F0F0F0F0F) c -= 4;
+    if (v & 0x3333333333333333) c -= 2;
+    if (v & 0x5555555555555555) c -= 1;
+    return c;
+  }
+  if (sizeof (T) == 16)
+  {
+    unsigned int shift = 64;
+    return (uint64_t) v ? _hb_bit_storage<uint64_t> ((uint64_t) v) :
+			  _hb_bit_storage<uint64_t> ((uint64_t) v >> shift) + shift;
+  }
+
+  assert (0);
 }
 
 static inline bool
diff --git a/src/hb-set-private.hh b/src/hb-set-private.hh
index 0c73d32..49cd791 100644
--- a/src/hb-set-private.hh
+++ b/src/hb-set-private.hh
@@ -109,21 +109,16 @@
       unsigned int i = m / ELT_BITS;
       unsigned int j = m & ELT_MASK;
 
-      for (; j < ELT_BITS; j++)
-        if (v[i] & (elt_t (1) << j))
-	  goto found;
-      for (i++; i < len (); i++)
-        if (v[i])
-	  for (j = 0; j < ELT_BITS; j++)
-	    if (v[i] & (elt_t (1) << j))
-	      goto found;
+      const elt_t vv = v[i] & ~((elt_t (1) << j) - 1);
+      for (const elt_t *p = &vv; i < len (); p = &v[++i])
+	if (*p)
+	{
+	  *codepoint = i * ELT_BITS + elt_get_min (*p);
+	  return true;
+	}
 
       *codepoint = INVALID;
       return false;
-
-    found:
-      *codepoint = i * ELT_BITS + j;
-      return true;
     }
     inline bool previous (hb_codepoint_t *codepoint) const
     {
@@ -136,52 +131,39 @@
       unsigned int i = m / ELT_BITS;
       unsigned int j = m & ELT_MASK;
 
-      for (; (int) j >= 0; j--)
-        if (v[i] & (elt_t (1) << j))
-	  goto found;
-      for (i--; (int) i >= 0; i--)
-        if (v[i])
-	  for (j = ELT_BITS - 1; (int) j >= 0; j--)
-	    if (v[i] & (elt_t (1) << j))
-	      goto found;
+      const elt_t vv = v[i] & ((elt_t (1) << (j + 1)) - 1);
+      for (const elt_t *p = &vv; (int) i >= 0; p = &v[--i])
+	if (*p)
+	{
+	  *codepoint = i * ELT_BITS + elt_get_max (*p);
+	  return true;
+	}
 
       *codepoint = INVALID;
       return false;
-
-    found:
-      *codepoint = i * ELT_BITS + j;
-      return true;
     }
     inline hb_codepoint_t get_min (void) const
     {
       for (unsigned int i = 0; i < len (); i++)
         if (v[i])
-	{
-	  elt_t e = v[i];
-	  for (unsigned int j = 0; j < ELT_BITS; j++)
-	    if (e & (elt_t (1) << j))
-	      return i * ELT_BITS + j;
-	}
+	  return i * ELT_BITS + elt_get_min (v[i]);
       return INVALID;
     }
     inline hb_codepoint_t get_max (void) const
     {
       for (int i = len () - 1; i >= 0; i--)
         if (v[i])
-	{
-	  elt_t e = v[i];
-	  for (int j = ELT_BITS - 1; j >= 0; j--)
-	    if (e & (elt_t (1) << j))
-	      return i * ELT_BITS + j;
-	}
+	  return i * ELT_BITS + elt_get_max (v[i]);
       return 0;
     }
 
-    typedef uint32_t elt_t;
-    static const unsigned int ELT_BITS = sizeof (elt_t) * 8;
-    static const unsigned int PAGE_BITS = ELT_BITS * ELT_BITS; /* 1024. Use to tune. */
+    typedef unsigned long long elt_t;
+    static const unsigned int PAGE_BITS = 1024;
     static_assert ((PAGE_BITS & ((PAGE_BITS) - 1)) == 0, "");
 
+    static inline unsigned int elt_get_min (const elt_t &elt) { return _hb_ctz (elt); }
+    static inline unsigned int elt_get_max (const elt_t &elt) { return _hb_bit_storage (elt) - 1; }
+
 #if 0 && HAVE_VECTOR_SIZE
     /* The vectorized version does not work with clang as non-const
      * elt() errs "non-const reference cannot bind to vector element". */
@@ -192,6 +174,7 @@
 
     vector_t v;
 
+    static const unsigned int ELT_BITS = sizeof (elt_t) * 8;
     static const unsigned int ELT_MASK = ELT_BITS - 1;
     static const unsigned int BITS = sizeof (vector_t) * 8;
     static const unsigned int MASK = BITS - 1;
diff --git a/test/api/hb-subset-test.h b/test/api/hb-subset-test.h
index e862567..49e5db4 100644
--- a/test/api/hb-subset-test.h
+++ b/test/api/hb-subset-test.h
@@ -86,7 +86,7 @@
     return face;
   }
   g_assert (false);
-  return NULL;
+  return NULL; /* Shut up, compiler! */
 }
 
 static inline hb_face_t *
@@ -115,7 +115,7 @@
 {
   hb_blob_t *expected_blob = hb_face_reference_table (expected, table);
   hb_blob_t *actual_blob = hb_face_reference_table (actual, table);
-  hb_test_assert_blob_eq(expected_blob, actual_blob);  
+  hb_test_assert_blobs_equal (expected_blob, actual_blob);
   hb_blob_destroy (expected_blob);
   hb_blob_destroy (actual_blob);
 }
diff --git a/test/api/hb-test.h b/test/api/hb-test.h
index 48ccc3b..a88b00c 100644
--- a/test/api/hb-test.h
+++ b/test/api/hb-test.h
@@ -160,8 +160,14 @@
 #if !GLIB_CHECK_VERSION(2,30,0)
 #define g_test_fail() g_error("Test failed")
 #endif
+#ifndef g_assert_true
+#define g_assert_true g_assert
+#endif
+#ifndef g_assert_cmpmem
+#define g_assert_cmpmem(m1, l1, m2, l2) g_assert_true (l1 == l2 && memcmp (m1, m2, l1) == 0)
+#endif
 
-static inline void hb_test_assert_blob_eq(hb_blob_t *expected_blob, hb_blob_t *actual_blob)
+static inline void hb_test_assert_blobs_equal (hb_blob_t *expected_blob, hb_blob_t *actual_blob)
 {
   unsigned int expected_length, actual_length;
   const char *raw_expected = hb_blob_get_data (expected_blob, &expected_length);
@@ -170,7 +176,6 @@
   g_assert_cmpint(0, ==, memcmp(raw_expected, raw_actual, expected_length));
 }
 
-
 static inline void
 hb_test_add_func (const char *test_path,
 		  hb_test_func_t   test_func)