Optimized bit functions, added intrinsics for GCC and Clang

Code by @medranSolus
diff --git a/include/vk_mem_alloc.h b/include/vk_mem_alloc.h
index 97e321c..c03fe85 100644
--- a/include/vk_mem_alloc.h
+++ b/include/vk_mem_alloc.h
@@ -3064,12 +3064,18 @@
 // Returns number of bits set to 1 in (v).

 static inline uint32_t VmaCountBitsSet(uint32_t v)

 {

+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) // __popcnt is an x86/x64-only MSVC intrinsic; other MSVC targets use the portable path

+    return __popcnt(v);

+#elif defined __GNUC__ || defined __clang__

+    return static_cast<uint32_t>(__builtin_popcount(v)); // builtin returns int

+#else

     uint32_t c = v - ((v >> 1) & 0x55555555);

     c = ((c >> 2) & 0x33333333) + (c & 0x33333333);

     c = ((c >> 4) + c) & 0x0F0F0F0F;

     c = ((c >> 8) + c) & 0x00FF00FF;

     c = ((c >> 16) + c) & 0x0000FFFF;

     return c;

+#endif

 }

 

 static inline uint8_t VmaBitScanLSB(uint64_t mask)

@@ -3078,15 +3084,20 @@
     unsigned long pos;

     if (_BitScanForward64(&pos, mask))

         return static_cast<uint8_t>(pos);

+    return UINT8_MAX;

+#elif defined __GNUC__ || defined __clang__

+    return static_cast<uint8_t>(__builtin_ffsll(mask) - 1); // ffsll is 1-based and yields 0 for an empty mask, so -1 maps to UINT8_MAX

 #else

     uint8_t pos = 0;

+    uint64_t bit = 1;

     do

     {

-        if (mask & (1ULL << pos))

+        if (mask & bit)

             return pos;

+        bit <<= 1;

     } while (pos++ < 63);

-#endif

     return UINT8_MAX;

+#endif

 }

 

 static inline uint8_t VmaBitScanLSB(uint32_t mask)

@@ -3095,15 +3106,20 @@
     unsigned long pos;

     if (_BitScanForward(&pos, mask))

         return static_cast<uint8_t>(pos);

+    return UINT8_MAX;

+#elif defined __GNUC__ || defined __clang__

+    return static_cast<uint8_t>(__builtin_ffs(static_cast<int>(mask)) - 1); // ffs is 1-based and yields 0 for an empty mask, so -1 maps to UINT8_MAX

 #else

     uint8_t pos = 0;

+    uint32_t bit = 1;

     do

     {

-        if (mask & (1UL << pos))

+        if (mask & bit)

             return pos;

+        bit <<= 1;

     } while (pos++ < 31);

-#endif

     return UINT8_MAX;

+#endif

 }

 

 static inline uint8_t VmaBitScanMSB(uint64_t mask)

@@ -3112,12 +3128,17 @@
     unsigned long pos;

     if (_BitScanReverse64(&pos, mask))

         return static_cast<uint8_t>(pos);

+#elif defined __GNUC__ || defined __clang__

+    if (mask) // clz is undefined for 0; an empty mask falls through to UINT8_MAX

+        return static_cast<uint8_t>(63 - __builtin_clzll(mask)); // leading-zero count -> index of highest set bit

 #else

     uint8_t pos = 63;

+    uint64_t bit = 1ULL << 63; // 64-bit literal: shifting a plain int by 63 is undefined

     do

     {

-        if (mask & (1ULL << pos))

+        if (mask & bit)

             return pos;

+        bit >>= 1;

     } while (pos-- > 0);

 #endif

     return UINT8_MAX;

@@ -3129,12 +3150,17 @@
     unsigned long pos;

     if (_BitScanReverse(&pos, mask))

         return static_cast<uint8_t>(pos);

+#elif defined __GNUC__ || defined __clang__

+    if (mask) // clz is undefined for 0; an empty mask falls through to UINT8_MAX

+        return static_cast<uint8_t>(31 - __builtin_clz(mask)); // 32-bit clz: leading-zero count -> index of highest set bit

 #else

     uint8_t pos = 31;

+    uint32_t bit = 1UL << 31; // unsigned literal: 1 << 31 overflows a signed int

     do

     {

-        if (mask & (1UL << pos))

+        if (mask & bit)

             return pos;

+        bit >>= 1;

     } while (pos-- > 0);

 #endif

     return UINT8_MAX;